author | Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-27 17:46:17 +0100
committer | felixjohnny.thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-28 12:08:05 +0000
commit | afd38f0c617d6f89b2b4532c6c44f116617e2b6f (patch)
tree | 03bc7d5a762099989b16a656fa8d397b490ed70e /src/core
parent | bdcb4c148ee2fdeaaddf4cf1e57bbb0de02bb894 (diff)
download | ComputeLibrary-afd38f0c617d6f89b2b4532c6c44f116617e2b6f.tar.gz
Apply clang-format on repository
Code is formatted as per a revised clang-format configuration
file (not part of this delivery). clang-format version 14.0.6 is used.
Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...)
And the following directories:
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/
There will be a follow-up for formatting of .cl files and the
files under tests/ and compute_kernel_writer/validation/.
Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Diffstat (limited to 'src/core')
286 files changed, 9956 insertions, 7901 deletions
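The revised .clang-format file is not part of this delivery, but the mechanical changes below hint at its key settings: "if (" rather than "if(", no padding inside braced initializers, long declarations wrapped to one parameter per line, and includes sorted into regrouped blocks. A minimal sketch of options consistent with those patterns might look as follows (an inference from the diff, not the actual Arm configuration):

# Hypothetical .clang-format sketch for clang-format 14.0.6, inferred from
# the patterns in this diff; the real configuration file was not delivered.
Language: Cpp
BasedOnStyle: LLVM                     # assumed baseline
ColumnLimit: 120                       # long signatures wrap near 120 columns
SpaceBeforeParens: ControlStatements   # if (...), for (...), switch (...)
Cpp11BracedListStyle: true             # ValidRegion{Coordinates(), ...} with no inner padding
BinPackParameters: false               # one parameter per line once a declaration wraps
AlignAfterOpenBracket: Align           # wrapped parameters align under the first
IncludeBlocks: Regroup                 # includes sorted and separated into blocks

Running clang-format -i with such a file over the non-excluded paths would reproduce this kind of whitespace-only churn across the 286 touched files without changing behaviour.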
diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp index ca2f7d238f..52be6990ab 100644 --- a/src/core/AccessWindowAutoPadding.cpp +++ b/src/core/AccessWindowAutoPadding.cpp @@ -28,12 +28,14 @@ using namespace arm_compute; -AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info) - : _info(info) +AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info) : _info(info) { } -ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const +ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const { ARM_COMPUTE_UNUSED(window); ARM_COMPUTE_UNUSED(input_valid_region); @@ -45,17 +47,17 @@ ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window, ValidRegion AccessWindowAutoPadding::compute_valid_region() const { - if(_info == nullptr) + if (_info == nullptr) { return ValidRegion{}; } - return ValidRegion{ Coordinates(), _info->tensor_shape() }; + return ValidRegion{Coordinates(), _info->tensor_shape()}; } void AccessWindowAutoPadding::set_valid_region() { - if(_info == nullptr) + if (_info == nullptr) { return; } @@ -75,7 +77,7 @@ bool AccessWindowAutoPadding::update_padding_if_needed(const Window &window) ARM_COMPUTE_UNUSED(window); // Only update the padding if the tensor allows it - if(_info == nullptr || !_info->is_resizable()) + if (_info == nullptr || !_info->is_resizable()) { return false; } diff --git a/src/core/AccessWindowAutoPadding.h b/src/core/AccessWindowAutoPadding.h index b8d1508679..406bdba0d8 100644 --- a/src/core/AccessWindowAutoPadding.h +++ b/src/core/AccessWindowAutoPadding.h @@ -74,9 +74,12 @@ public: ValidRegion compute_valid_region() const; // Inherited methods overridden: - bool update_window_if_needed(Window &window) const override; - bool update_padding_if_needed(const Window &window) override; - ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override; + bool update_window_if_needed(Window &window) const override; + bool update_padding_if_needed(const Window &window) override; + ValidRegion compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const override; private: ITensorInfo *_info; diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp index 0607011bc5..98182b1202 100644 --- a/src/core/AccessWindowStatic.cpp +++ b/src/core/AccessWindowStatic.cpp @@ -34,7 +34,10 @@ AccessWindowStatic::AccessWindowStatic(ITensorInfo *info, int start_x, int start { } -ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const +ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const { ARM_COMPUTE_UNUSED(border_undefined); ARM_COMPUTE_UNUSED(border_size); @@ -44,7 +47,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region) const { - if(_info == nullptr) + if (_info == nullptr) { return input_valid_region; } @@ -57,7 +60,7 @@ ValidRegion 
AccessWindowStatic::compute_valid_region(const Window &window, Valid // Start of the valid region is equal to the start of the static access but // never outside of the tensor. anchor.set(0, std::max<int>(0, _start_x)); - if(_info->num_dimensions() > 1) + if (_info->num_dimensions() > 1) { anchor.set(1, std::max<int>(0, _start_y)); } @@ -65,7 +68,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid // End of the valid region is equal to the end of the static access but // never outside of the tensor. shape.set(0, std::min<int>(_end_x, _info->tensor_shape()[0])); - if(_info->num_dimensions() > 1) + if (_info->num_dimensions() > 1) { shape.set(1, std::min<int>(_end_y, _info->tensor_shape()[1])); } @@ -75,7 +78,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid void AccessWindowStatic::set_valid_region(const Window &window, const ValidRegion &input_valid_region) { - if(_info != nullptr) + if (_info != nullptr) { _info->set_valid_region(compute_valid_region(window, input_valid_region)); } @@ -84,7 +87,7 @@ void AccessWindowStatic::set_valid_region(const Window &window, const ValidRegio bool AccessWindowStatic::update_window_if_needed(Window &window) const { // If the padding is not enough and the tensor is not resizable, shrink the window to size 0 - if(_info == nullptr || _info->is_resizable()) + if (_info == nullptr || _info->is_resizable()) { return false; } @@ -96,48 +99,50 @@ bool AccessWindowStatic::update_window_if_needed(Window &window) const bool window_modified = false; // Calculate if padding is enough - if(_start_y < 0) + if (_start_y < 0) { const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]); - if(_start_y < front_pad_y_available) + if (_start_y < front_pad_y_available) { window_modified = true; } } - if(!window_modified) + if (!window_modified) { - if(_end_y > static_cast<int>(shape[1])) + if (_end_y > static_cast<int>(shape[1])) { const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size(); const int tail_pad_y_available = (stride_z / strides[1]) - shape[1]; - if(static_cast<int>(shape[1]) + tail_pad_y_available < _end_y) + if (static_cast<int>(shape[1]) + tail_pad_y_available < _end_y) { window_modified = true; } } - if(!window_modified) + if (!window_modified) { const int stride_y = _info->num_dimensions() > 1 ? 
strides[1] : _info->total_size(); - if(_start_x < 0) + if (_start_x < 0) { - const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element), stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]); + const int front_pad_x_available = + -std::min<int>(static_cast<int>(offset_first_element), stride_y - shape[0] * strides[0]) / + static_cast<int>(strides[0]); - if(_start_x < front_pad_x_available) + if (_start_x < front_pad_x_available) { window_modified = true; } } - if(!window_modified && _end_x > static_cast<int>(shape[0])) + if (!window_modified && _end_x > static_cast<int>(shape[0])) { const int tail_pad_x_available = (stride_y / strides[0]) - shape[0]; - if(static_cast<int>(shape[0]) + tail_pad_x_available < _end_x) + if (static_cast<int>(shape[0]) + tail_pad_x_available < _end_x) { window_modified = true; } @@ -146,9 +151,9 @@ bool AccessWindowStatic::update_window_if_needed(Window &window) const } // If padding is not enough - if(window_modified) + if (window_modified) { - for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 0; i < Coordinates::num_max_dimensions; ++i) { window.set(i, Window::Dimension(0, 0, 1)); } @@ -162,7 +167,7 @@ bool AccessWindowStatic::update_padding_if_needed(const Window &window) ARM_COMPUTE_UNUSED(window); // Only update the padding if the tensor allows it - if(_info == nullptr || !_info->is_resizable()) + if (_info == nullptr || !_info->is_resizable()) { return false; } diff --git a/src/core/AccessWindowStatic.h b/src/core/AccessWindowStatic.h index f7d43cbb55..5c6d2c7db0 100644 --- a/src/core/AccessWindowStatic.h +++ b/src/core/AccessWindowStatic.h @@ -86,9 +86,12 @@ public: ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region) const; // Inherited methods overriden: - bool update_window_if_needed(Window &window) const override; - bool update_padding_if_needed(const Window &window) override; - ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override; + bool update_window_if_needed(Window &window) const override; + bool update_padding_if_needed(const Window &window) override; + ValidRegion compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const override; private: ITensorInfo *_info; diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp index d8bd4c4de1..42f0081c14 100644 --- a/src/core/AccessWindowTranspose.cpp +++ b/src/core/AccessWindowTranspose.cpp @@ -29,9 +29,12 @@ using namespace arm_compute; -ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const +ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const { - if(_info == nullptr) + if (_info == nullptr) { return input_valid_region; } @@ -41,7 +44,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va Coordinates old_anchor(anchor); TensorShape old_shape(shape); - if(!border_undefined) + if (!border_undefined) { border_size = BorderSize(0); } @@ -53,7 +56,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va // the kernel to write back output values. 
// As the relation between input and output is transposed window.y() is // used for x anchor and window.x() for y anchor. - if(_info->dimension(0) > 1) + if (_info->dimension(0) > 1) { anchor.set(0, std::max<int>(window.y().start() * _scale_x, anchor[1] + border_size.top) + _x); } @@ -69,15 +72,19 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va // a size of the region. // As the relation between input and output is transposed window.y() is // used for x shape and window.x() for y shape. - if(_info->dimension(0) > 1) + if (_info->dimension(0) > 1) { - shape.set(0, std::min<int>((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]); + shape.set(0, std::min<int>((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right, + (window.y().end() - window.y().step()) * _scale_x + _width) - + anchor[0]); } - shape.set(1, std::min<int>((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]); + shape.set(1, std::min<int>((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom, + (window.x().end() - window.x().step()) * _scale_y + _height) - + anchor[1]); // For higher dimensions use the intersection of the window size and the // valid region of the input - for(size_t d = 2; d < _info->num_dimensions(); ++d) + for (size_t d = 2; d < _info->num_dimensions(); ++d) { anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d])); shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]); @@ -89,7 +96,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va bool AccessWindowTranspose::update_window_if_needed(Window &window) const { // Only update the window size if we can't use padding - if(_info == nullptr || _info->is_resizable()) + if (_info == nullptr || _info->is_resizable()) { return false; } @@ -107,12 +114,12 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const const int max_y = window.x().end() * _scale_y + _y; // Adjust window start for output's Y dimension (so X in (input) window) - if(min_y < 0) + if (min_y < 0) { // Calculate rows available above the tensor const int front_pad_y_available = -offset_first_element / strides[1]; - if(min_y < front_pad_y_available) + if (min_y < front_pad_y_available) { // Not enough padding available, need to shrink the window const int start = adjust_up(min_y, front_pad_y_available, window.x().step() * _scale_y) - _y; @@ -126,17 +133,18 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const } // Adjust window end for Y dimension - if(max_y > static_cast<int>(shape[1])) + if (max_y > static_cast<int>(shape[1])) { const int stride_z = _info->num_dimensions() > 2 ? 
strides[2] : _info->total_size(); // Calculate rows available below the tensor const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y; - if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y) + if (static_cast<int>(shape[1]) + tail_pad_y_available < max_y) { // Not enough padding available, need to shrink the window - const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) + window.x().step() * _scale_y - _y - _height; + const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) + + window.x().step() * _scale_y - _y - _height; window.set(0, Window::Dimension(window.x().start(), end / _scale_y, window.x().step())); window_modified = true; } @@ -151,11 +159,14 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size(); // Adjust window start for X dimension - if(min_x < 0) + if (min_x < 0) { - const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]); + const int front_pad_x_available = + -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], + stride_y - shape[0] * strides[0]) / + static_cast<int>(strides[0]); - if(min_x < front_pad_x_available) + if (min_x < front_pad_x_available) { // Not enough padding available, need to shrink the window const int start = adjust_up(min_x, front_pad_x_available, window.y().step() * _scale_x) - _x; @@ -168,14 +179,15 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const } // Adjust window end for X dimension - if(max_x > static_cast<int>(shape[0])) + if (max_x > static_cast<int>(shape[0])) { const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x; - if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x) + if (static_cast<int>(shape[0]) + tail_pad_x_available < max_x) { // Not enough padding available, need to shrink the window - const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) + window.y().step() * _scale_x - _x - _width; + const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) + + window.y().step() * _scale_x - _x - _width; window.set(1, Window::Dimension(window.y().start(), end / _scale_x, window.y().step())); window_modified = true; } @@ -189,7 +201,7 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const bool AccessWindowTranspose::update_padding_if_needed(const Window &window) { // Only update the padding if the tensor allows it - if(_info == nullptr || !_info->is_resizable()) + if (_info == nullptr || !_info->is_resizable()) { return false; } diff --git a/src/core/AccessWindowTranspose.h b/src/core/AccessWindowTranspose.h index 0306076d6e..12bb9a535b 100644 --- a/src/core/AccessWindowTranspose.h +++ b/src/core/AccessWindowTranspose.h @@ -42,7 +42,10 @@ public: bool update_window_if_needed(Window &window) const override; bool update_padding_if_needed(const Window &window) override; using AccessWindowRectangle::compute_valid_region; - ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override; + ValidRegion compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) 
const override; }; } // namespace arm_compute #endif /*ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H*/ diff --git a/src/core/CL/CLCommandBuffer.cpp b/src/core/CL/CLCommandBuffer.cpp index 7fcfdf2c89..d094dcdaea 100644 --- a/src/core/CL/CLCommandBuffer.cpp +++ b/src/core/CL/CLCommandBuffer.cpp @@ -38,7 +38,7 @@ std::unique_ptr<CLCommandBuffer> CLCommandBuffer::create(cl_command_queue queue) const auto &cl_device = CLKernelLibrary::get().get_device(); const auto has_mutable_dispatch = command_buffer_mutable_dispatch_supported(cl_device); - if(has_mutable_dispatch) + if (has_mutable_dispatch) { return std::make_unique<CLMutableCommandBuffer>(queue); } diff --git a/src/core/CL/CLCommandBuffer.h b/src/core/CL/CLCommandBuffer.h index 8a94e389fa..90e434161e 100644 --- a/src/core/CL/CLCommandBuffer.h +++ b/src/core/CL/CLCommandBuffer.h @@ -87,7 +87,8 @@ public: * @param[in] global The global work size. * @param[in] local The local work size. */ - virtual void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0; + virtual void + add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0; /** Add the mutable argument to the current kernel enqueue command. * @@ -154,7 +155,7 @@ protected: CLCommandBuffer &state(State state); private: - State _state{ State::Created }; + State _state{State::Created}; }; } // namespace arm_compute diff --git a/src/core/CL/CLCompatCommandBuffer.cpp b/src/core/CL/CLCompatCommandBuffer.cpp index f1a902c7b9..242fd7719c 100644 --- a/src/core/CL/CLCompatCommandBuffer.cpp +++ b/src/core/CL/CLCompatCommandBuffer.cpp @@ -31,8 +31,7 @@ namespace arm_compute { -CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue) - : _queue(queue) +CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue) : _queue(queue) { } @@ -40,11 +39,14 @@ CLCompatCommandBuffer::~CLCompatCommandBuffer() { } -void CLCompatCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) +void CLCompatCommandBuffer::add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) { ARM_COMPUTE_ERROR_ON(state() != State::Created); - _kernel_cmds.push_back(KernelCommand{ kernel, offset, global, local, {} }); + _kernel_cmds.push_back(KernelCommand{kernel, offset, global, local, {}}); } void CLCompatCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) @@ -52,7 +54,7 @@ void CLCompatCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const ARM_COMPUTE_ERROR_ON(state() != State::Created); ARM_COMPUTE_ERROR_ON(_kernel_cmds.empty()); - _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{ arg_idx, size, value }); + _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{arg_idx, size, value}); } void CLCompatCommandBuffer::finalize() @@ -61,7 +63,7 @@ void CLCompatCommandBuffer::finalize() _kernel_cmds.shrink_to_fit(); - for(auto &cmd : _kernel_cmds) + for (auto &cmd : _kernel_cmds) { cmd.mutable_args.shrink_to_fit(); } @@ -80,25 +82,19 @@ void CLCompatCommandBuffer::enqueue() { ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - for(const auto &cmd : _kernel_cmds) + for (const auto &cmd : _kernel_cmds) { - for(const auto &arg : cmd.mutable_args) + for (const auto &arg : cmd.mutable_args) { const auto error = clSetKernelArg(cmd.kernel, arg.arg_index, arg.arg_size, arg.arg_value); 
handle_cl_error("clSetKernelArg", error); } - const auto error = clEnqueueNDRangeKernel( - _queue, - cmd.kernel, - static_cast<cl_uint>(cmd.global.dimensions()), - cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr, - cmd.global.get(), - cmd.local.dimensions() != 0 ? cmd.local.get() : nullptr, - 0, - nullptr, - nullptr); + const auto error = + clEnqueueNDRangeKernel(_queue, cmd.kernel, static_cast<cl_uint>(cmd.global.dimensions()), + cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr, cmd.global.get(), + cmd.local.dimensions() != 0 ? cmd.local.get() : nullptr, 0, nullptr, nullptr); handle_cl_error("clEnqueueNDRangeKernel", error); } diff --git a/src/core/CL/CLCompatCommandBuffer.h b/src/core/CL/CLCompatCommandBuffer.h index e91d52d2d6..d5df106425 100644 --- a/src/core/CL/CLCompatCommandBuffer.h +++ b/src/core/CL/CLCompatCommandBuffer.h @@ -57,7 +57,10 @@ public: /** Disallow move assignment. */ CLCompatCommandBuffer &operator=(CLCompatCommandBuffer &&) = delete; - void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) override; + void add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) override; void finalize() override; diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp index 2d024f9c2f..9bbc32657e 100644 --- a/src/core/CL/CLCompileContext.cpp +++ b/src/core/CL/CLCompileContext.cpp @@ -22,19 +22,19 @@ * SOFTWARE. */ #include "arm_compute/core/CL/CLCompileContext.h" -#include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Utils.h" + #include "support/StringSupport.h" #include <regex> namespace arm_compute { -CLBuildOptions::CLBuildOptions() - : _build_opts() +CLBuildOptions::CLBuildOptions() : _build_opts() { } @@ -45,7 +45,7 @@ void CLBuildOptions::add_option(std::string option) void CLBuildOptions::add_option_if(bool cond, std::string option) { - if(cond) + if (cond) { add_option(std::move(option)); } @@ -63,7 +63,7 @@ void CLBuildOptions::add_options(const StringSet &options) void CLBuildOptions::add_options_if(bool cond, const StringSet &options) { - if(cond) + if (cond) { add_options(options); } @@ -79,26 +79,35 @@ bool CLBuildOptions::operator==(const CLBuildOptions &other) const return _build_opts == other._build_opts; } -Program::Program() - : _context(), _device(), _is_binary(false), _name(), _source(), _binary() +Program::Program() : _context(), _device(), _is_binary(false), _name(), _source(), _binary() { } Program::Program(cl::Context context, std::string name, std::string source) - : _context(std::move(context)), _device(), _is_binary(false), _name(std::move(name)), _source(std::move(source)), _binary() + : _context(std::move(context)), + _device(), + _is_binary(false), + _name(std::move(name)), + _source(std::move(source)), + _binary() { } Program::Program(cl::Context context, cl::Device device, std::string name, std::vector<unsigned char> binary) - : _context(std::move(context)), _device(std::move(device)), _is_binary(true), _name(std::move(name)), _source(), _binary(std::move(binary)) + : _context(std::move(context)), + _device(std::move(device)), + _is_binary(true), + _name(std::move(name)), + _source(), + _binary(std::move(binary)) { } Program::operator cl::Program() const { - if(_is_binary) + if (_is_binary) { - return cl::Program(_context, { _device }, { _binary }); + return 
cl::Program(_context, {_device}, {_binary}); } else { @@ -112,12 +121,12 @@ bool Program::build(const cl::Program &program, const std::string &build_options { return program.build(build_options.c_str()) == CL_SUCCESS; } - catch(const cl::Error &e) + catch (const cl::Error &e) { cl_int err = CL_SUCCESS; const auto build_info = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&err); - for(auto &pair : build_info) + for (auto &pair : build_info) { std::cerr << pair.second << std::endl; } @@ -133,14 +142,12 @@ cl::Program Program::build(const std::string &build_options) const return cl_program; } -Kernel::Kernel() - : _name(), _kernel() +Kernel::Kernel() : _name(), _kernel() { } Kernel::Kernel(std::string name, const cl::Program &program) - : _name(std::move(name)), - _kernel(cl::Kernel(program, _name.c_str())) + : _name(std::move(name)), _kernel(cl::Kernel(program, _name.c_str())) { } CLCompileContext::CLCompileContext() @@ -156,15 +163,19 @@ CLCompileContext::CLCompileContext(cl::Context context, const cl::Device &device _is_wbsm_supported = get_wbsm_support_info(device); } -Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source, - const std::string &kernel_path, const StringSet &build_options_set, bool is_binary) const +Kernel CLCompileContext::create_kernel(const std::string &kernel_name, + const std::string &program_name, + const std::string &program_source, + const std::string &kernel_path, + const StringSet &build_options_set, + bool is_binary) const { const std::string build_options = generate_build_options(build_options_set, kernel_path); const std::string built_program_name = program_name + "_" + build_options; auto built_program_it = _built_programs_map.find(built_program_name); cl::Program cl_program; - if(_built_programs_map.end() != built_program_it) + if (_built_programs_map.end() != built_program_it) { // If program has been built, retrieve to create kernel from it cl_program = built_program_it->second; @@ -184,11 +195,12 @@ Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std return Kernel(kernel_name, cl_program); } -const Program &CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const +const Program & +CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const { const auto program_it = _programs_map.find(program_name); - if(program_it != _programs_map.end()) + if (program_it != _programs_map.end()) { return program_it->second; } @@ -199,9 +211,10 @@ const Program &CLCompileContext::load_program(const std::string &program_name, c ARM_COMPUTE_UNUSED(is_binary); program = Program(_context, program_name, program_source); #else /* EMBEDDED_KERNELS */ - if(is_binary) + if (is_binary) { - program = Program(_context, _device.cl_device(), program_name, std::vector<unsigned char>(program_source.begin(), program_source.end())); + program = Program(_context, _device.cl_device(), program_name, + std::vector<unsigned char>(program_source.begin(), program_source.end())); } else { @@ -218,18 +231,19 @@ const Program &CLCompileContext::load_program(const std::string &program_name, c void CLCompileContext::set_context(cl::Context context) { _context = std::move(context); - if(_context.get() != nullptr) + if (_context.get() != nullptr) { const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>(); - if(!cl_devices.empty()) + if (!cl_devices.empty()) { _device = 
CLDevice(cl_devices[0]); } } } -std::string CLCompileContext::generate_build_options(const StringSet &build_options_set, const std::string &kernel_path) const +std::string CLCompileContext::generate_build_options(const StringSet &build_options_set, + const std::string &kernel_path) const { std::string concat_str; bool ext_supported = false; @@ -241,27 +255,27 @@ std::string CLCompileContext::generate_build_options(const StringSet &build_opti #endif // defined(ARM_COMPUTE_DEBUG_ENABLED) GPUTarget gpu_arch = get_arch_from_target(_device.target()); - concat_str += " -DGPU_ARCH=" + support::cpp11::to_string( - static_cast<std::underlying_type<GPUTarget>::type>(gpu_arch)); + concat_str += + " -DGPU_ARCH=" + support::cpp11::to_string(static_cast<std::underlying_type<GPUTarget>::type>(gpu_arch)); - if(_device.supported("cl_khr_fp16")) + if (_device.supported("cl_khr_fp16")) { concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 "; } - if(_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product")) + if (_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product")) { concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ENABLED=1 "; } - if(_device.supported("cl_arm_integer_dot_product_accumulate_int8")) + if (_device.supported("cl_arm_integer_dot_product_accumulate_int8")) { concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED=1 "; } std::tie(ext_supported, ext_buildopts) = _device.is_non_uniform_workgroup_supported(); - if(ext_supported) + if (ext_supported) { concat_str += ext_buildopts; } @@ -270,7 +284,7 @@ std::string CLCompileContext::generate_build_options(const StringSet &build_opti ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!"); } - if(gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11) + if (gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11) { concat_str += " -DUNROLL_WITH_PRAGMA "; } @@ -295,7 +309,7 @@ std::string CLCompileContext::stringify_set(const StringSet &s, const std::strin #endif /* EMBEDDED_KERNELS */ // Concatenate set - for(const auto &el : s) + for (const auto &el : s) { concat_set += " " + el; } @@ -340,7 +354,7 @@ cl::NDRange CLCompileContext::default_ndrange() const GPUTarget _target = get_target_from_device(_device.cl_device()); cl::NDRange default_range; - switch(_target) + switch (_target) { case GPUTarget::MIDGARD: case GPUTarget::T600: @@ -370,7 +384,8 @@ size_t CLCompileContext::max_local_workgroup_size(const cl::Kernel &kernel) cons size_t result; size_t err = kernel.getWorkGroupInfo(_device.cl_device(), CL_KERNEL_WORK_GROUP_SIZE, &result); - ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + ARM_COMPUTE_ERROR_ON_MSG(err != 0, + "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); ARM_COMPUTE_UNUSED(err); return result; @@ -392,7 +407,7 @@ int32_t CLCompileContext::get_ddk_version() const const std::regex ddk_regex("r([0-9]*)p[0-9]"); std::smatch ddk_match; - if(std::regex_search(device_version, ddk_match, ddk_regex)) + if (std::regex_search(device_version, ddk_match, ddk_regex)) { return std::stoi(ddk_match[1]); } diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index 78f36100d5..5ea99d360a 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -22,14 +22,15 @@ * SOFTWARE. 
*/ #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLTypes.h" -#include "arm_compute/core/utils/DataTypeUtils.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Log.h" #include "arm_compute/core/Types.h" -#include "src/gpu/cl/ClCompileContext.h" +#include "arm_compute/core/utils/DataTypeUtils.h" +#include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/ClKernelLibrary.h" #include <utility> @@ -39,7 +40,7 @@ namespace arm_compute { std::string get_cl_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -75,7 +76,7 @@ std::string get_cl_type_from_data_type(const DataType &dt) std::string get_cl_promoted_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -105,7 +106,7 @@ std::string get_cl_promoted_type_from_data_type(const DataType &dt) std::string get_cl_unsigned_type_from_element_size(size_t element_size) { - switch(element_size) + switch (element_size) { case 1: return "uchar"; @@ -123,7 +124,7 @@ std::string get_cl_unsigned_type_from_element_size(size_t element_size) std::string get_cl_signed_type_from_element_size(size_t element_size) { - switch(element_size) + switch (element_size) { case 1: return "char"; @@ -141,7 +142,7 @@ std::string get_cl_signed_type_from_element_size(size_t element_size) std::string get_cl_select_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -174,7 +175,7 @@ std::string get_cl_select_type_from_data_type(const DataType &dt) std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -192,7 +193,7 @@ std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt) std::string get_data_size_from_data_type(const DataType &dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::S8: @@ -244,8 +245,9 @@ bool dot8_supported(const cl::Device &device) const GPUTarget gpu_target = get_target_from_name(device_name); // SW_WORKAROUND: Workaround for DDK revision r14p0.to enable cl_arm_integer_dot_product_int8 - std::set<GPUTarget> sw_workaround_issue = { GPUTarget::G76 }; - return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || sw_workaround_issue.count(gpu_target) != 0); + std::set<GPUTarget> sw_workaround_issue = {GPUTarget::G76}; + return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || + sw_workaround_issue.count(gpu_target) != 0); } bool dot8_acc_supported(const cl::Device &device) @@ -256,23 +258,23 @@ bool dot8_acc_supported(const cl::Device &device) CLVersion get_cl_version(const cl::Device &device) { std::string version_str = device.getInfo<CL_DEVICE_VERSION>(); - if(version_str.find("OpenCL 3") != std::string::npos) + if (version_str.find("OpenCL 3") != std::string::npos) { return CLVersion::CL30; } - else if(version_str.find("OpenCL 2") != std::string::npos) + else if (version_str.find("OpenCL 2") != std::string::npos) { return CLVersion::CL20; } - else if(version_str.find("OpenCL 1.2") != std::string::npos) + else if (version_str.find("OpenCL 1.2") != std::string::npos) { return CLVersion::CL12; } - else if(version_str.find("OpenCL 1.1") != std::string::npos) + else if (version_str.find("OpenCL 1.1") != std::string::npos) { return CLVersion::CL11; } - else if(version_str.find("OpenCL 1.0") != std::string::npos) + else if 
(version_str.find("OpenCL 1.0") != std::string::npos) { return CLVersion::CL10; } @@ -287,14 +289,15 @@ bool device_supports_extension(const cl::Device &device, const char *extension_n return (pos != std::string::npos); } -bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Size2D &kernel_size, DataLayout data_layout) +bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, + const Size2D &kernel_size, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN); using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>; - std::vector<WinogradConfiguration> winograd_configs_nchw = - { + std::vector<WinogradConfiguration> winograd_configs_nchw = { WinogradConfiguration(std::pair<int, int>(1, 2), std::pair<int, int>(1, 3)), WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)), WinogradConfiguration(std::pair<int, int>(2, 1), std::pair<int, int>(3, 1)), @@ -303,11 +306,9 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)), WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)), WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1)), - WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5)) - }; + WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5))}; - std::vector<WinogradConfiguration> winograd_configs_nhwc = - { + std::vector<WinogradConfiguration> winograd_configs_nhwc = { WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3)), WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)), WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(3, 1)), @@ -324,19 +325,21 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si std::pair<int, int>(kernel_size.width, kernel_size.height)); // Return true if supported - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) != winograd_configs_nchw.end()); + return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) != + winograd_configs_nchw.end()); } else { - return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != winograd_configs_nhwc.end()); + return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != + winograd_configs_nhwc.end()); } } size_t preferred_vector_width(const cl::Device &device, const DataType dt) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::S8: @@ -382,7 +385,7 @@ size_t get_cl_image_pitch_alignment(const cl::Device &device) cl_int err = clGetDeviceInfo(device(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &pixel_aligment, nullptr); - if(err == CL_SUCCESS) + if (err == CL_SUCCESS) { return pixel_aligment; } @@ -396,12 +399,14 @@ bool get_cl_non_uniform_work_group_supported(const cl::Device &device) { cl_bool supported = CL_FALSE; - cl_int err = clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), &supported, nullptr); + cl_int err = + clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), &supported, nullptr); return (err == CL_SUCCESS && supported == CL_TRUE); } -cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set<std::string> &build_opts) 
+cl::Kernel +create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set<std::string> &build_opts) { opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get(); @@ -409,7 +414,8 @@ cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_ auto kernel_src = klib.program(program_name); const std::string kernel_path = klib.kernel_path(); - return static_cast<cl::Kernel>(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path, build_opts, kernel_src.is_binary)); + return static_cast<cl::Kernel>(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path, + build_opts, kernel_src.is_binary)); } cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size) @@ -423,8 +429,9 @@ cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimensio bool get_wbsm_support_info(const cl::Device &device) { cl_bitfield capabilities = 0; - cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, sizeof(cl_bitfield), &capabilities, nullptr); - if((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM)) + cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, sizeof(cl_bitfield), + &capabilities, nullptr); + if ((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM)) { return true; } @@ -433,35 +440,33 @@ bool get_wbsm_support_info(const cl::Device &device) void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint) { - cl_int err = clSetKernelExecInfo(kernel.get(), - CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, - sizeof(cl_int), - &wbsm_hint); + cl_int err = clSetKernelExecInfo(kernel.get(), CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, + sizeof(cl_int), &wbsm_hint); ARM_COMPUTE_UNUSED(err); ARM_COMPUTE_ERROR_ON(err != CL_SUCCESS); } bool export_to_cl_image(const ITensorInfo *tensor) { - if(tensor->tensor_shape()[0] % 4 != 0) + if (tensor->tensor_shape()[0] % 4 != 0) { return false; } // If not floating point - if(!is_data_type_float(tensor->data_type())) + if (!is_data_type_float(tensor->data_type())) { return false; } // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform - if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + if (!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) { return false; } // Check cl image pitch alignment - if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) + if (get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) { return false; } @@ -471,7 +476,7 @@ bool export_to_cl_image(const ITensorInfo *tensor) const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>(); const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>(); - if(image_w > max_image_w || image_h > max_image_h) + if (image_w > max_image_w || image_h > max_image_h) { return false; } @@ -481,9 +486,9 @@ bool export_to_cl_image(const ITensorInfo *tensor) void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list<int> values) { - for(const int value : values) + for (const int value : values) { - if(value > max_manual_loop_unrolling) + if (value > max_manual_loop_unrolling) { built_opts.add_option("-DUNROLL_WITH_PRAGMA"); return; diff --git a/src/core/CL/CLKernelLibrary.cpp 
b/src/core/CL/CLKernelLibrary.cpp index c5a0796c3a..e69d006750 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -22,8 +22,11 @@ * SOFTWARE. */ #include "arm_compute/core/CL/CLKernelLibrary.h" + #include "arm_compute/core/Error.h" + #include "src/gpu/cl/ClKernelLibrary.h" + #include <algorithm> #include <array> #include <fstream> @@ -31,8 +34,7 @@ #include <vector> namespace arm_compute { -CLKernelLibrary::CLKernelLibrary() - : _compile_context() +CLKernelLibrary::CLKernelLibrary() : _compile_context() { opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the CLKernelLibrary is built } @@ -41,13 +43,15 @@ CLKernelLibrary &CLKernelLibrary::get() static CLKernelLibrary _kernel_library; return _kernel_library; } -Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, const std::set<std::string> &build_options_set) const +Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, + const std::set<std::string> &build_options_set) const { const opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get(); const std::string program_name = klib.program_name(kernel_name); auto program = klib.program(program_name); const std::string &kernel_path = CLKernelLibrary::get().get_kernel_path(); - return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set, program.is_binary); + return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set, + program.is_binary); } std::string CLKernelLibrary::get_program_name(const std::string &kernel_name) const { @@ -131,4 +135,4 @@ CLCompileContext &CLKernelLibrary::get_compile_context() { return _compile_context; } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/CLMutableCommandBuffer.cpp b/src/core/CL/CLMutableCommandBuffer.cpp index b9c59ac6f0..05b351fc25 100644 --- a/src/core/CL/CLMutableCommandBuffer.cpp +++ b/src/core/CL/CLMutableCommandBuffer.cpp @@ -31,8 +31,7 @@ namespace arm_compute { -CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) - : CLCommandBuffer() +CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) : CLCommandBuffer() { cl_int status = CL_SUCCESS; @@ -52,7 +51,10 @@ CLMutableCommandBuffer::~CLMutableCommandBuffer() handle_cl_error("clReleaseCommandBufferKHR", status); } -void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) +void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) { ARM_COMPUTE_ERROR_ON(state() != State::Created); @@ -65,18 +67,8 @@ void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &off }; const auto error = clCommandNDRangeKernelKHR( - _cb, - nullptr, - properties, - kernel, - global.dimensions(), - offset.dimensions() != 0 ? offset.get() : nullptr, - global.get(), - local.dimensions() != 0 ? local.get() : nullptr, - 0, - nullptr, - nullptr, - &mutable_handle); + _cb, nullptr, properties, kernel, global.dimensions(), offset.dimensions() != 0 ? offset.get() : nullptr, + global.get(), local.dimensions() != 0 ? local.get() : nullptr, 0, nullptr, nullptr, &mutable_handle); handle_cl_error("clCommandNDRangeKernelKHR", error); @@ -114,7 +106,7 @@ void CLMutableCommandBuffer::finalize() size_t arg_no = 0; - for(auto &mut_dispatch_cfg : _mut_dispatch_cfgs) + for (auto &mut_dispatch_cfg : _mut_dispatch_cfgs) { ARM_COMPUTE_ERROR_ON(arg_no >= _mut_arg_cfgs.size()); mut_dispatch_cfg.arg_list = &_mut_arg_cfgs[arg_no]; @@ -132,9 +124,7 @@ void CLMutableCommandBuffer::update() { ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - const auto error = clUpdateMutableCommandsKHR( - _cb, - &_mut_cfg); + const auto error = clUpdateMutableCommandsKHR(_cb, &_mut_cfg); handle_cl_error("clUpdateMutableCommandsKHR", error); } @@ -143,13 +133,7 @@ void CLMutableCommandBuffer::enqueue() { ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - const auto error = clEnqueueCommandBufferKHR( - 0, - nullptr, - _cb, - 0, - nullptr, - nullptr); + const auto error = clEnqueueCommandBufferKHR(0, nullptr, _cb, 0, nullptr, nullptr); handle_cl_error("clEnqueueCommandBufferKHR", error); } diff --git a/src/core/CL/CLMutableCommandBuffer.h b/src/core/CL/CLMutableCommandBuffer.h index 04e94b0bb2..8997d7d1fd 100644 --- a/src/core/CL/CLMutableCommandBuffer.h +++ b/src/core/CL/CLMutableCommandBuffer.h @@ -57,7 +57,10 @@ public: /** Disallow move assignment. 
*/ CLMutableCommandBuffer &operator=(CLMutableCommandBuffer &&) = delete; - void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) override; + void add_kernel(cl_kernel kernel, + const cl::NDRange &offset, + const cl::NDRange &global, + const cl::NDRange &local) override; void finalize() override; diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp index 289300b3a1..290ed32648 100644 --- a/src/core/CL/CLUtils.cpp +++ b/src/core/CL/CLUtils.cpp @@ -26,9 +26,10 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "support/StringSupport.h" namespace arm_compute @@ -38,15 +39,15 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); const cl::Context &ctx = CLKernelLibrary::get().context(); - const cl::Buffer &buffer = tensor->cl_buffer(); + const cl::Buffer &buffer = tensor->cl_buffer(); const ITensorInfo *info = tensor->info(); - ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(), - "Tensor paddings must not be locked to allow extending paddings to satisfy cl_image pitch alignment requirement"); + ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(), "Tensor paddings must not be locked to allow extending paddings to " + "satisfy cl_image pitch alignment requirement"); - const size_t image_w{ info->dimension(0) / 4 }; - const size_t image_h{ info->tensor_shape().total_size() / info->dimension(0) }; - const size_t max_image_w{ CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>() }; - const size_t max_image_h{ CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>() }; + const size_t image_w{info->dimension(0) / 4}; + const size_t image_h{info->tensor_shape().total_size() / info->dimension(0)}; + const size_t max_image_w{CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>()}; + const size_t max_image_h{CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>()}; ARM_COMPUTE_UNUSED(max_image_w, max_image_h); ARM_COMPUTE_ERROR_ON_MSG(image_w > max_image_w, "Image width exceeds maximum width for exporting to cl_image"); @@ -58,18 +59,22 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im return create_image2d_from_buffer(ctx, buffer, shape2d, info->data_type(), image_row_pitch, image_type); } -cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type) +cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, + const cl::Buffer &buffer, + const TensorShape &shape2d, + DataType data_type, + size_t image_row_pitch, + CLImage2DType image_type) { ARM_COMPUTE_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); ARM_COMPUTE_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment"); - ARM_COMPUTE_ERROR_ON_MSG(buffer.get() == nullptr, - "Cannot create cl_image from empty cl_buffer"); + ARM_COMPUTE_ERROR_ON_MSG(buffer.get() == nullptr, "Cannot create cl_image from 
empty cl_buffer"); cl_channel_type cl_data_type; - switch(data_type) + switch (data_type) { case DataType::F32: cl_data_type = CL_FLOAT; @@ -84,7 +89,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer cl_mem cl_image; cl_int err = CL_SUCCESS; - const cl_image_format format = { CL_RGBA, cl_data_type }; + const cl_image_format format = {CL_RGBA, cl_data_type}; cl_image_desc desc; memset(&desc, 0, sizeof(desc)); @@ -94,7 +99,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer desc.image_width = shape2d[0]; desc.image_height = shape2d[1]; - switch(image_type) + switch (image_type) { case CLImage2DType::ReadOnly: cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err); @@ -114,7 +119,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer void handle_cl_error(const std::string &function_name, cl_int error_code) { - if(error_code != CL_SUCCESS) + if (error_code != CL_SUCCESS) { std::string error_message = function_name + " - Error code: " + std::to_string(error_code); ARM_COMPUTE_ERROR(error_message.c_str()); diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h index de9c1b3194..f9dcfeac3a 100644 --- a/src/core/CL/CLUtils.h +++ b/src/core/CL/CLUtils.h @@ -72,7 +72,12 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im * * @return cl::Image2D object */ -cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type); +cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, + const cl::Buffer &buffer, + const TensorShape &shape2d, + DataType data_type, + size_t image_row_pitch, + CLImage2DType image_type); /** Check for CL error code and throw exception accordingly. * diff --git a/src/core/CL/CLValidate.h b/src/core/CL/CLValidate.h index 7b5294e452..50d224f1c0 100644 --- a/src/core/CL/CLValidate.h +++ b/src/core/CL/CLValidate.h @@ -29,11 +29,13 @@ namespace arm_compute { -#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \ - ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported())) +#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \ + CLKernelLibrary::get().fp16_supported())) -#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \ - ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported())) +#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \ + CLKernelLibrary::get().fp16_supported())) /** Return an error if int64_base_atomics extension is not supported by the device. 
* @@ -43,11 +45,13 @@ namespace arm_compute * * @return Status */ -inline arm_compute::Status error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line) +inline arm_compute::Status +error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line) { - if(!CLKernelLibrary::get().int64_base_atomics_supported()) + if (!CLKernelLibrary::get().int64_base_atomics_supported()) { - return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, "Atomic functions are not supported"); + return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, + "Atomic functions are not supported"); } return arm_compute::Status{}; } diff --git a/src/core/CL/DefaultLWSHeuristics.cpp b/src/core/CL/DefaultLWSHeuristics.cpp index a53fdbbab6..f96b24d2a9 100644 --- a/src/core/CL/DefaultLWSHeuristics.cpp +++ b/src/core/CL/DefaultLWSHeuristics.cpp @@ -31,13 +31,13 @@ cl::NDRange get_gemm_lws(size_t gws_x, size_t gws_y, size_t gws_z) { ARM_COMPUTE_UNUSED(gws_y); - if(gws_z != 1) + if (gws_z != 1) { return cl::NDRange(4, 4, 2); } else { - if(gws_x > 256) + if (gws_x > 256) { return cl::NDRange(2, 16, 1); } @@ -59,9 +59,9 @@ cl::NDRange get_direct_lws(size_t gws_x, size_t gws_y, size_t gws_z) { ARM_COMPUTE_UNUSED(gws_z); - if(gws_x < gws_y) + if (gws_x < gws_y) { - if(gws_x < 4) + if (gws_x < 4) { return cl::NDRange(std::min(gws_x, static_cast<size_t>(2u)), 32, 1); } @@ -81,7 +81,7 @@ cl::NDRange get_dwc_lws(size_t gws_x, size_t gws_y, size_t gws_z) ARM_COMPUTE_UNUSED(gws_y); ARM_COMPUTE_UNUSED(gws_z); - if(gws_x < 32) + if (gws_x < 32) { return cl::NDRange(gws_x, 4, 4); } @@ -100,7 +100,7 @@ cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws) const size_t gws_y = gws[1]; const size_t gws_z = gws[2]; - switch(kernel_type) + switch (kernel_type) { case CLKernelType::GEMM: { @@ -124,4 +124,4 @@ cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws) } } } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index dc3a86a528..ac53e7f1d2 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -25,18 +25,23 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" + #include "src/core/helpers/Utils.h" #include <cstddef> -void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items) +void arm_compute::enqueue(cl::CommandQueue &queue, + ICLKernel &kernel, + const Window &window, + const cl::NDRange &lws_hint, + bool use_dummy_work_items) { - if(kernel.kernel()() == nullptr) + if (kernel.kernel()() == nullptr) { return; } - for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) + for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_ERROR_ON(window[i].step() == 0); // Make sure that dimensions > Z are 1 @@ -46,7 +51,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items); // Check for empty NDRange - if(gws.dimensions() == 0) + if (gws.dimensions() == 0) { return; } @@ -54,7 +59,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind kernel.cache_gws(gws); cl::NDRange valid_lws; - if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size()) + if (lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size()) { valid_lws = cl::NullRange; } @@ -65,12 +70,12 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind cl::NDRange lws = cl::NullRange; - if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2])) + if ((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2])) { lws = valid_lws; } - if(CLKernelLibrary::get().is_wbsm_supported()) + if (CLKernelLibrary::get().is_wbsm_supported()) { set_wbsm(kernel.kernel(), kernel.wbsm_hint()); } @@ -90,7 +95,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons // Calculate offset to the start of the window unsigned int offset_first_element = info->offset_first_element_in_bytes(); - for(unsigned int n = 0; n < info->num_dimensions(); ++n) + for (unsigned int n = 0; n < info->num_dimensions(); ++n) { offset_first_element += (window.is_broadcasted(n) ? 0 : window[n].start()) * strides[n]; } @@ -98,7 +103,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons unsigned int idx_start = idx; _kernel.setArg(idx++, tensor->cl_buffer()); - for(unsigned int d = 0; d < dimension_size; ++d) + for (unsigned int d = 0; d < dimension_size; ++d) { _kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 0 : strides[d]); _kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 
0 : (strides[d] * window[d].step())); @@ -107,7 +112,8 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons _kernel.setArg<cl_uint>(idx++, offset_first_element); ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_tensor<dimension_size>() != idx, - "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor<dimension_size>()); + "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", + dimension_size, num_arguments_per_tensor<dimension_size>()); ARM_COMPUTE_UNUSED(idx_start); } @@ -178,7 +184,7 @@ void ICLKernel::set_target(cl::Device &device) size_t ICLKernel::get_max_workgroup_size() { - if(_max_workgroup_size == 0) + if (_max_workgroup_size == 0) { _max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel); } @@ -187,7 +193,7 @@ size_t ICLKernel::get_max_workgroup_size() cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work_items) { - if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0) + if ((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0) { return cl::NullRange; } @@ -196,7 +202,7 @@ cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work (window.y().end() - window.y().start()) / window.y().step(), (window.z().end() - window.z().start()) / window.z().step()); - if(use_dummy_work_items) + if (use_dummy_work_items) { gws.get()[0] = get_next_power_two(gws[0]); gws.get()[1] = get_next_power_two(gws[1]); diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h index c82809cef3..6aebef15a5 100644 --- a/src/core/CL/ICLKernel.h +++ b/src/core/CL/ICLKernel.h @@ -27,10 +27,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLTypes.h" #include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/IKernel.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/CL/CLTuningParams.h" #include "src/core/CL/DefaultLWSHeuristics.h" @@ -43,14 +43,14 @@ namespace { bool is_same_lws(cl::NDRange lws0, cl::NDRange lws1) { - if(lws0.dimensions() != lws1.dimensions()) + if (lws0.dimensions() != lws1.dimensions()) { return false; } - for(size_t i = 0; i < lws0.dimensions(); ++i) + for (size_t i = 0; i < lws0.dimensions(); ++i) { - if(lws0.get()[i] != lws1.get()[i]) + if (lws0.get()[i] != lws1.get()[i]) { return false; } @@ -71,7 +71,7 @@ private: * * @return The number of arguments enqueued per array object. */ - template <unsigned int dimension_size> + template <unsigned int dimension_size> constexpr static unsigned int num_arguments_per_array() { return num_arguments_per_tensor<dimension_size>(); @@ -80,7 +80,7 @@ private: * * @return The number of arguments enqueued per tensor object. */ - template <unsigned int dimension_size> + template <unsigned int dimension_size> constexpr static unsigned int num_arguments_per_tensor() { return 2 + 2 * dimension_size; @@ -116,11 +116,13 @@ protected: * @param[in] window The maximum window which will be returned by window() * @param[in] tuning_params_hint (Optional) Tuning parameters to use. 
*/ - void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0)) + void configure_internal(const Window &window, + CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), + 0)) { _tuning_params_hint = tuning_params_hint; - if(is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange())) + if (is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange())) { // Disable use_dummy_work_items at configure time. Because dummy work items only affect gws size, which // will be recalculated with use_dummy_work_items flag at run time again anyway. @@ -133,7 +135,13 @@ protected: public: /** Constructor */ ICLKernel() - : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _type(CLKernelType::UNKNOWN), _tuning_params_hint(), _cached_gws(cl::NullRange) + : _kernel(nullptr), + _target(GPUTarget::MIDGARD), + _config_id(arm_compute::default_config_id), + _max_workgroup_size(0), + _type(CLKernelType::UNKNOWN), + _tuning_params_hint(), + _cached_gws(cl::NullRange) { } /** Returns a reference to the OpenCL kernel of this object. @@ -161,7 +169,11 @@ public: * @param[in] window Window the kernel will be executed on. */ template <typename T> - void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window) + void add_1D_array_argument(unsigned int &idx, + const ICLArray<T> *array, + const Strides &strides, + unsigned int num_dimensions, + const Window &window) { add_array_argument<T, 1>(idx, array, strides, num_dimensions, window); } @@ -184,7 +196,7 @@ public: */ void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window) { - if(cond) + if (cond) { add_1D_tensor_argument(idx, tensor, window); } @@ -208,7 +220,7 @@ public: */ void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window) { - if(cond) + if (cond) { add_2D_tensor_argument(idx, tensor, window); } @@ -469,7 +481,11 @@ private: * @param[in] window Window the kernel will be executed on. */ template <typename T, unsigned int dimension_size> - void add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window); + void add_array_argument(unsigned int &idx, + const ICLArray<T> *array, + const Strides &strides, + unsigned int num_dimensions, + const Window &window); /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. @@ -505,7 +521,11 @@ private: * * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed. */ -void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false); +void enqueue(cl::CommandQueue &queue, + ICLKernel &kernel, + const Window &window, + const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), + bool use_dummy_work_items = false); /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx. 
* @@ -516,14 +536,15 @@ void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, c * @param[in] window Window the kernel will be executed on. */ template <typename T, unsigned int dimension_size> -void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window) +void ICLKernel::add_array_argument( + unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window) { ARM_COMPUTE_ERROR_ON(array == nullptr); // Calculate offset to the start of the window unsigned int offset_first_element = 0; - for(unsigned int n = 0; n < num_dimensions; ++n) + for (unsigned int n = 0; n < num_dimensions; ++n) { offset_first_element += window[n].start() * strides[n]; } @@ -531,7 +552,7 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, cons unsigned int idx_start = idx; _kernel.setArg(idx++, array->cl_buffer()); - for(unsigned int dimension = 0; dimension < dimension_size; dimension++) + for (unsigned int dimension = 0; dimension < dimension_size; dimension++) { _kernel.setArg<cl_uint>(idx++, strides[dimension]); _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step()); @@ -540,8 +561,9 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, cons _kernel.setArg<cl_uint>(idx++, offset_first_element); ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_array<dimension_size>() != idx, - "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>()); + "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", + dimension_size, num_arguments_per_array<dimension_size>()); ARM_COMPUTE_UNUSED(idx_start); } -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLKERNEL_H */ diff --git a/src/core/CL/ICLSimple2DKernel.cpp b/src/core/CL/ICLSimple2DKernel.cpp index 5d8295bdfe..3f7edbb88d 100644 --- a/src/core/CL/ICLSimple2DKernel.cpp +++ b/src/core/CL/ICLSimple2DKernel.cpp @@ -40,6 +40,5 @@ void ICLSimple2DKernel::run(const Window &window, cl::CommandQueue &queue) add_2D_tensor_argument(idx, _input, slice); add_2D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } diff --git a/src/core/CL/ICLSimple2DKernel.h b/src/core/CL/ICLSimple2DKernel.h index 5246492401..97bc1e58c2 100644 --- a/src/core/CL/ICLSimple2DKernel.h +++ b/src/core/CL/ICLSimple2DKernel.h @@ -37,5 +37,5 @@ public: // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLSIMPLE2DKERNEL_H */ diff --git a/src/core/CL/ICLSimple3DKernel.cpp b/src/core/CL/ICLSimple3DKernel.cpp index fef1a86125..71d7d1f07b 100644 --- a/src/core/CL/ICLSimple3DKernel.cpp +++ b/src/core/CL/ICLSimple3DKernel.cpp @@ -42,6 +42,5 @@ void ICLSimple3DKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } diff --git a/src/core/CL/ICLSimple3DKernel.h b/src/core/CL/ICLSimple3DKernel.h index ff0b274663..5071b6b339 100644 --- a/src/core/CL/ICLSimple3DKernel.h +++ b/src/core/CL/ICLSimple3DKernel.h @@ -39,5 
+39,5 @@ public: // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLSIMPLE3DKERNEL_H */ diff --git a/src/core/CL/ICLSimpleKernel.cpp b/src/core/CL/ICLSimpleKernel.cpp index d67fefdf71..c31db8355f 100644 --- a/src/core/CL/ICLSimpleKernel.cpp +++ b/src/core/CL/ICLSimpleKernel.cpp @@ -22,30 +22,35 @@ * SOFTWARE. */ #include "src/core/CL/ICLSimpleKernel.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/WindowHelpers.h" using namespace arm_compute; -ICLSimpleKernel::ICLSimpleKernel() - : _input(nullptr), _output(nullptr) +ICLSimpleKernel::ICLSimpleKernel() : _input(nullptr), _output(nullptr) { } -void ICLSimpleKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size) +void ICLSimpleKernel::configure(const ICLTensor *input, + ICLTensor *output, + unsigned int num_elems_processed_per_iteration, + bool border_undefined, + const BorderSize &border_size) { _input = input; _output = output; // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size); + Window win = + calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, - AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access); output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size); diff --git a/src/core/CL/ICLSimpleKernel.h b/src/core/CL/ICLSimpleKernel.h index b35547a217..6afd7309aa 100644 --- a/src/core/CL/ICLSimpleKernel.h +++ b/src/core/CL/ICLSimpleKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -55,12 +56,16 @@ public: * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant. * @param[in] border_size (Optional) Size of the border. 
*/ - void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize()); + void configure(const ICLTensor *input, + ICLTensor *output, + unsigned int num_elems_processed_per_iteration, + bool border_undefined = false, + const BorderSize &border_size = BorderSize()); protected: const ICLTensor *_input; ICLTensor *_output; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLSIMPLEKERNEL_H */ diff --git a/src/core/CL/ICLTensor.cpp b/src/core/CL/ICLTensor.cpp index b541bff04a..0771db7f50 100644 --- a/src/core/CL/ICLTensor.cpp +++ b/src/core/CL/ICLTensor.cpp @@ -27,8 +27,7 @@ using namespace arm_compute; -ICLTensor::ICLTensor() - : _mapping(nullptr) +ICLTensor::ICLTensor() : _mapping(nullptr) { } diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp index b092dfb4e2..35421d025e 100644 --- a/src/core/CL/OpenCL.cpp +++ b/src/core/CL/OpenCL.cpp @@ -36,11 +36,7 @@ namespace arm_compute { -CLSymbols::CLSymbols() noexcept(false) - : _loaded( -{ - false, false -}) +CLSymbols::CLSymbols() noexcept(false) : _loaded({false, false}) { } @@ -52,9 +48,9 @@ CLSymbols &CLSymbols::get() bool CLSymbols::load_default() { - static const std::vector<std::string> libraries_filenames{ "libOpenCL.so", "libGLES_mali.so", "libmali.so" }; + static const std::vector<std::string> libraries_filenames{"libOpenCL.so", "libGLES_mali.so", "libmali.so"}; - if(_loaded.first) + if (_loaded.first) { return _loaded.second; } @@ -62,34 +58,32 @@ bool CLSymbols::load_default() // Indicate that default loading has been tried _loaded.first = true; - if(load(libraries_filenames, /* use_loader */ false)) + if (load(libraries_filenames, /* use_loader */ false)) { - ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from shared library"); + ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, + "Failed to load OpenCL symbols from shared library"); return true; } #ifdef __ANDROID__ // When running in NDK environment, the above libraries are not accessible. 
- static const std::vector<std::string> android_libraries_filenames{ "libOpenCL-pixel.so", "libOpenCL-car.so" }; + static const std::vector<std::string> android_libraries_filenames{"libOpenCL-pixel.so", "libOpenCL-car.so"}; - if(load(android_libraries_filenames, /* use_loader */ true)) + if (load(android_libraries_filenames, /* use_loader */ true)) { - ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from android shared library"); + ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, + "Failed to load OpenCL symbols from android shared library"); return true; } #endif // __ANDROID__ // If not returned till here then libraries not found std::stringstream ss; - std::for_each(libraries_filenames.begin(), libraries_filenames.end(), [&ss](const std::string & s) - { - ss << s << " "; - }); + std::for_each(libraries_filenames.begin(), libraries_filenames.end(), + [&ss](const std::string &s) { ss << s << " "; }); #ifdef __ANDROID__ - std::for_each(android_libraries_filenames.begin(), android_libraries_filenames.end(), [&ss](const std::string & s) - { - ss << s << " "; - }); + std::for_each(android_libraries_filenames.begin(), android_libraries_filenames.end(), + [&ss](const std::string &s) { ss << s << " "; }); #endif // __ANDROID__ std::cerr << "Couldn't find any of the following OpenCL library: " << ss.str() << std::endl; return false; @@ -99,15 +93,15 @@ bool CLSymbols::load(const std::vector<std::string> &libraries_filenames, bool u { void *handle = nullptr; unsigned int index = 0; - for(index = 0; index < libraries_filenames.size(); ++index) + for (index = 0; index < libraries_filenames.size(); ++index) { handle = dlopen(libraries_filenames[index].c_str(), RTLD_LAZY | RTLD_LOCAL); - if(handle != nullptr) + if (handle != nullptr) { break; } } - if(index == libraries_filenames.size()) + if (index == libraries_filenames.size()) { // Set status of loading to failed _loaded.second = false; @@ -115,22 +109,23 @@ bool CLSymbols::load(const std::vector<std::string> &libraries_filenames, bool u } #ifdef __ANDROID__ - typedef void* (*loadOpenCLPointer_t)(const char* name); + typedef void *(*loadOpenCLPointer_t)(const char *name); loadOpenCLPointer_t loadOpenCLPointer; - if (use_loader) { + if (use_loader) + { typedef void (*enableOpenCL_t)(); - enableOpenCL_t enableOpenCL = - reinterpret_cast<enableOpenCL_t>(dlsym(handle, "enableOpenCL")); + enableOpenCL_t enableOpenCL = reinterpret_cast<enableOpenCL_t>(dlsym(handle, "enableOpenCL")); enableOpenCL(); - loadOpenCLPointer = reinterpret_cast<loadOpenCLPointer_t>( - dlsym(handle, "loadOpenCLPointer")); - } else { + loadOpenCLPointer = reinterpret_cast<loadOpenCLPointer_t>(dlsym(handle, "loadOpenCLPointer")); + } + else + { loadOpenCLPointer = nullptr; } -#define LOAD_FUNCTION_PTR(func_name, _handle) \ - func_name##_ptr = reinterpret_cast<decltype(func_name) *>( use_loader ? \ - loadOpenCLPointer(#func_name) : dlsym(handle, #func_name)); +#define LOAD_FUNCTION_PTR(func_name, _handle) \ + func_name##_ptr = reinterpret_cast<decltype(func_name) *>(use_loader ? 
loadOpenCLPointer(#func_name) \ + : dlsym(handle, #func_name)); #else /* __ANDROID__ */ (void)use_loader; // Avoid unused warning #define LOAD_FUNCTION_PTR(func_name, handle) \ @@ -234,12 +229,11 @@ bool opencl_is_available() } } // namespace arm_compute -cl_int clEnqueueMarker(cl_command_queue command_queue, - cl_event *event) +cl_int clEnqueueMarker(cl_command_queue command_queue, cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueMarker_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, event); } @@ -249,12 +243,11 @@ cl_int clEnqueueMarker(cl_command_queue command_queue, } } -cl_int clWaitForEvents(cl_uint num_events, - const cl_event *event_list) +cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clWaitForEvents_ptr; - if(func != nullptr) + if (func != nullptr) { return func(num_events, event_list); } @@ -264,12 +257,18 @@ cl_int clWaitForEvents(cl_uint num_events, } } -cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags flags, void *svm_ptr, - size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) +cl_int clEnqueueSVMMap(cl_command_queue command_queue, + cl_bool blocking_map, + cl_map_flags flags, + void *svm_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueSVMMap_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, blocking_map, flags, svm_ptr, size, num_events_in_wait_list, event_wait_list, event); } @@ -279,12 +278,15 @@ cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_ } } -cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *svm_ptr, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event) +cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, + void *svm_ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueSVMUnmap_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, svm_ptr, num_events_in_wait_list, event_wait_list, event); } @@ -298,7 +300,7 @@ void *clSVMAlloc(cl_context context, cl_svm_mem_flags_arm flags, size_t size, cl { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSVMAlloc_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, size, alignment); } @@ -312,7 +314,7 @@ void clSVMFree(cl_context context, void *svm_pointer) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSVMFree_ptr; - if(func != nullptr) + if (func != nullptr) { func(context, svm_pointer); } @@ -326,7 +328,7 @@ cl_int clGetContextInfo(cl_context context, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetContextInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, param_name, param_value_size, param_value, param_value_size_ret); } @@ -343,7 +345,7 @@ cl_command_queue clCreateCommandQueue(cl_context context, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateCommandQueue_ptr; - if(func != 
nullptr) + if (func != nullptr) { return func(context, device, properties, errcode_ret); } @@ -360,7 +362,7 @@ cl_command_queue clCreateCommandQueueWithProperties(cl_context c { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateCommandQueueWithProperties_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, device, properties, errcode_ret); } @@ -370,17 +372,16 @@ cl_command_queue clCreateCommandQueueWithProperties(cl_context c } } -cl_context clCreateContext( - const cl_context_properties *properties, - cl_uint num_devices, - const cl_device_id *devices, - void (*pfn_notify)(const char *, const void *, size_t, void *), - void *user_data, - cl_int *errcode_ret) +cl_context clCreateContext(const cl_context_properties *properties, + cl_uint num_devices, + const cl_device_id *devices, + void (*pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateContext_ptr; - if(func != nullptr) + if (func != nullptr) { return func(properties, num_devices, devices, pfn_notify, user_data, errcode_ret); } @@ -398,7 +399,7 @@ cl_context clCreateContextFromType(const cl_context_properties *properties, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateContextFromType_ptr; - if(func != nullptr) + if (func != nullptr) { return func(properties, device_type, pfn_notify, user_data, errcode_ret); } @@ -408,17 +409,16 @@ cl_context clCreateContextFromType(const cl_context_properties *properties, } } -cl_int clBuildProgram( - cl_program program, - cl_uint num_devices, - const cl_device_id *device_list, - const char *options, - void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), - void *user_data) +cl_int clBuildProgram(cl_program program, + cl_uint num_devices, + const cl_device_id *device_list, + const char *options, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clBuildProgram_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, num_devices, device_list, options, pfn_notify, user_data); } @@ -428,22 +428,22 @@ cl_int clBuildProgram( } } -cl_int clEnqueueNDRangeKernel( - cl_command_queue command_queue, - cl_kernel kernel, - cl_uint work_dim, - const size_t *global_work_offset, - const size_t *global_work_size, - const size_t *local_work_size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + const size_t *local_work_size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); + return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, + num_events_in_wait_list, event_wait_list, event); } else { @@ -451,15 +451,11 @@ cl_int clEnqueueNDRangeKernel( } } -cl_int clSetKernelArg( - cl_kernel kernel, - cl_uint 
arg_index, - size_t arg_size, - const void *arg_value) +cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSetKernelArg_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, arg_index, arg_size, arg_value); } @@ -473,7 +469,7 @@ cl_int clRetainMemObject(cl_mem memobj) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainMemObject_ptr; - if(func != nullptr) + if (func != nullptr) { return func(memobj); } @@ -487,7 +483,7 @@ cl_int clReleaseMemObject(cl_mem memobj) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseMemObject_ptr; - if(func != nullptr) + if (func != nullptr) { return func(memobj); } @@ -497,17 +493,16 @@ cl_int clReleaseMemObject(cl_mem memobj) } } -cl_int clEnqueueUnmapMemObject( - cl_command_queue command_queue, - cl_mem memobj, - void *mapped_ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue, + cl_mem memobj, + void *mapped_ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueUnmapMemObject_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event); } @@ -521,7 +516,7 @@ cl_int clRetainCommandQueue(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainCommandQueue_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -535,7 +530,7 @@ cl_int clReleaseContext(cl_context context) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseContext_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context); } @@ -548,7 +543,7 @@ cl_int clReleaseEvent(cl_event event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseEvent_ptr; - if(func != nullptr) + if (func != nullptr) { return func(event); } @@ -558,22 +553,22 @@ cl_int clReleaseEvent(cl_event event) } } -cl_int clEnqueueWriteBuffer( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_write, - size_t offset, - size_t size, - const void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueWriteBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + size_t offset, + size_t size, + const void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueWriteBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); + return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, + event); } else { @@ -581,22 +576,22 @@ cl_int clEnqueueWriteBuffer( } } -cl_int clEnqueueReadBuffer( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_read, - size_t offset, - size_t size, - void *ptr, - cl_uint num_events_in_wait_list, - const 
cl_event *event_wait_list, - cl_event *event) +cl_int clEnqueueReadBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + size_t offset, + size_t size, + void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueReadBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); + return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, + event); } else { @@ -604,17 +599,16 @@ cl_int clEnqueueReadBuffer( } } -cl_int clGetProgramBuildInfo( - cl_program program, - cl_device_id device, - cl_program_build_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetProgramBuildInfo(cl_program program, + cl_device_id device, + cl_program_build_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetProgramBuildInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, device, param_name, param_value_size, param_value, param_value_size_ret); } @@ -628,7 +622,7 @@ cl_int clRetainProgram(cl_program program) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainProgram_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program); } @@ -638,27 +632,27 @@ cl_int clRetainProgram(cl_program program) } } -void *clEnqueueMapBuffer( - cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_map, - cl_map_flags map_flags, - size_t offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event, - cl_int *errcode_ret) +void *clEnqueueMapBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_map, + cl_map_flags map_flags, + size_t offset, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clEnqueueMapBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret); + return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, + event_wait_list, event, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -670,7 +664,7 @@ cl_int clReleaseCommandQueue(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseCommandQueue_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -680,24 +674,23 @@ cl_int clReleaseCommandQueue(cl_command_queue command_queue) } } -cl_program clCreateProgramWithBinary( - cl_context context, - cl_uint num_devices, - const cl_device_id *device_list, - const size_t *lengths, - const unsigned char **binaries, - cl_int *binary_status, - cl_int *errcode_ret) +cl_program clCreateProgramWithBinary(cl_context context, + cl_uint num_devices, + const cl_device_id *device_list, + const size_t *lengths, + const unsigned char 
**binaries, + cl_int *binary_status, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateProgramWithBinary_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -709,7 +702,7 @@ cl_int clRetainContext(cl_context context) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainContext_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context); } @@ -723,7 +716,7 @@ cl_int clReleaseProgram(cl_program program) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseProgram_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program); } @@ -737,7 +730,7 @@ cl_int clFlush(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clFlush_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -751,7 +744,7 @@ cl_int clFinish(cl_command_queue command_queue) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clFinish_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue); } @@ -761,16 +754,15 @@ cl_int clFinish(cl_command_queue command_queue) } } -cl_int clGetProgramInfo( - cl_program program, - cl_program_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetProgramInfo(cl_program program, + cl_program_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetProgramInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, param_name, param_value_size, param_value, param_value_size_ret); } @@ -780,20 +772,17 @@ cl_int clGetProgramInfo( } } -cl_kernel clCreateKernel( - cl_program program, - const char *kernel_name, - cl_int *errcode_ret) +cl_kernel clCreateKernel(cl_program program, const char *kernel_name, cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateKernel_ptr; - if(func != nullptr) + if (func != nullptr) { return func(program, kernel_name, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -805,7 +794,7 @@ cl_int clRetainKernel(cl_kernel kernel) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainKernel_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel); } @@ -815,22 +804,17 @@ cl_int clRetainKernel(cl_kernel kernel) } } -cl_mem clCreateBuffer( - cl_context context, - cl_mem_flags flags, - size_t size, - void *host_ptr, - cl_int *errcode_ret) +cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateBuffer_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, size, host_ptr, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -839,21 +823,17 @@ cl_mem clCreateBuffer( } cl_program 
clCreateProgramWithSource( - cl_context context, - cl_uint count, - const char **strings, - const size_t *lengths, - cl_int *errcode_ret) + cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateProgramWithSource_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, count, strings, lengths, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -865,7 +845,7 @@ cl_int clReleaseKernel(cl_kernel kernel) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clReleaseKernel_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel); } @@ -878,12 +858,12 @@ cl_int clReleaseKernel(cl_kernel kernel) cl_int clGetDeviceIDs(cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, - cl_device_id *devices, + cl_device_id *devices, cl_uint *num_devices) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetDeviceIDs_ptr; - if(func != nullptr) + if (func != nullptr) { return func(platform, device_type, num_entries, devices, num_devices); } @@ -901,7 +881,7 @@ cl_int clGetDeviceInfo(cl_device_id device, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetDeviceInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(device, param_name, param_value_size, param_value, param_value_size_ret); } @@ -911,15 +891,12 @@ cl_int clGetDeviceInfo(cl_device_id device, } } -cl_int clGetMemObjectInfo(cl_mem memobj, - cl_mem_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetMemObjectInfo( + cl_mem memobj, cl_mem_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetMemObjectInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(memobj, param_name, param_value_size, param_value, param_value_size_ret); } @@ -933,7 +910,7 @@ cl_int clRetainEvent(cl_event event) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clRetainEvent_ptr; - if(func != nullptr) + if (func != nullptr) { return func(event); } @@ -951,7 +928,7 @@ cl_int clGetPlatformInfo(cl_platform_id platform, { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetPlatformInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(platform, param_name, param_value_size, param_value, param_value_size_ret); } @@ -965,7 +942,7 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetPlatformIDs_ptr; - if(func != nullptr) + if (func != nullptr) { return func(num_entries, platforms, num_platforms); } @@ -975,17 +952,16 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint } } -cl_int -clGetKernelWorkGroupInfo(cl_kernel kernel, - cl_device_id device, - cl_kernel_work_group_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetKernelWorkGroupInfo(cl_kernel kernel, + cl_device_id device, + cl_kernel_work_group_info param_name, + size_t param_value_size, + void *param_value, + size_t 
*param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, device, param_name, param_value_size, param_value, param_value_size_ret); } @@ -995,16 +971,15 @@ clGetKernelWorkGroupInfo(cl_kernel kernel, } } -cl_int -clGetCommandQueueInfo(cl_command_queue command_queue, - cl_command_queue_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetCommandQueueInfo(cl_command_queue command_queue, + cl_command_queue_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetCommandQueueInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_queue, param_name, param_value_size, param_value, param_value_size_ret); } @@ -1014,16 +989,15 @@ clGetCommandQueueInfo(cl_command_queue command_queue, } } -cl_int -clGetKernelInfo(cl_kernel kernel, - cl_kernel_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetKernelInfo(cl_kernel kernel, + cl_kernel_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetKernelInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, param_name, param_value_size, param_value, param_value_size_ret); } @@ -1033,16 +1007,15 @@ clGetKernelInfo(cl_kernel kernel, } } -cl_int -clGetEventProfilingInfo(cl_event event, - cl_profiling_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) +cl_int clGetEventProfilingInfo(cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clGetEventProfilingInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(event, param_name, param_value_size, param_value, param_value_size_ret); } @@ -1052,23 +1025,22 @@ clGetEventProfilingInfo(cl_event event, } } -cl_mem -clCreateImage(cl_context context, - cl_mem_flags flags, - const cl_image_format *image_format, - const cl_image_desc *image_desc, - void *host_ptr, - cl_int *errcode_ret) +cl_mem clCreateImage(cl_context context, + cl_mem_flags flags, + const cl_image_format *image_format, + const cl_image_desc *image_desc, + void *host_ptr, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clCreateImage_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, image_format, image_desc, host_ptr, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } @@ -1076,14 +1048,12 @@ clCreateImage(cl_context context, } } -cl_int clSetKernelExecInfo(cl_kernel kernel, - cl_kernel_exec_info param_name, - size_t param_value_size, - const void *param_value) +cl_int +clSetKernelExecInfo(cl_kernel kernel, cl_kernel_exec_info param_name, size_t param_value_size, const void *param_value) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clSetKernelExecInfo_ptr; - if(func != nullptr) + if (func != nullptr) { return func(kernel, param_name, param_value_size, 
param_value); } @@ -1093,22 +1063,21 @@ cl_int clSetKernelExecInfo(cl_kernel kernel, } } -cl_command_buffer_khr clCreateCommandBufferKHR( - cl_uint num_queues, - const cl_command_queue* queues, - const cl_command_buffer_properties_khr* properties, - cl_int* errcode_ret) +cl_command_buffer_khr clCreateCommandBufferKHR(cl_uint num_queues, + const cl_command_queue *queues, + const cl_command_buffer_properties_khr *properties, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clCreateCommandBufferKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(num_queues, queues, properties, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_INVALID_OPERATION; } @@ -1122,7 +1091,7 @@ cl_int clFinalizeCommandBufferKHR(cl_command_buffer_khr command_buffer) arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clFinalizeCommandBufferKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_buffer); } @@ -1137,7 +1106,7 @@ cl_int clRetainCommandBufferKHR(cl_command_buffer_khr command_buffer) arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clRetainCommandBufferKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_buffer); } @@ -1152,7 +1121,7 @@ cl_int clReleaseCommandBufferKHR(cl_command_buffer_khr command_buffer) arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clReleaseCommandBufferKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_buffer); } @@ -1162,18 +1131,17 @@ cl_int clReleaseCommandBufferKHR(cl_command_buffer_khr command_buffer) } } -cl_int clEnqueueCommandBufferKHR( - cl_uint num_queues, - cl_command_queue* queues, - cl_command_buffer_khr command_buffer, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event) +cl_int clEnqueueCommandBufferKHR(cl_uint num_queues, + cl_command_queue *queues, + cl_command_buffer_khr command_buffer, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clEnqueueCommandBufferKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(num_queues, queues, command_buffer, num_events_in_wait_list, event_wait_list, event); } @@ -1183,27 +1151,26 @@ cl_int clEnqueueCommandBufferKHR( } } - -cl_int clCommandNDRangeKernelKHR( - cl_command_buffer_khr command_buffer, - cl_command_queue command_queue, - const cl_ndrange_kernel_command_properties_khr* properties, - cl_kernel kernel, - cl_uint work_dim, - const size_t* global_work_offset, - const size_t* global_work_size, - const size_t* local_work_size, - cl_uint num_sync_points_in_wait_list, - const cl_sync_point_khr* sync_point_wait_list, - cl_sync_point_khr* sync_point, - cl_mutable_command_khr* mutable_handle) +cl_int clCommandNDRangeKernelKHR(cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + const cl_ndrange_kernel_command_properties_khr *properties, + cl_kernel kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + const size_t *local_work_size, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr *sync_point_wait_list, + cl_sync_point_khr *sync_point, + cl_mutable_command_khr *mutable_handle) { arm_compute::CLSymbols::get().load_default(); const auto 
func = arm_compute::CLSymbols::get().clCommandNDRangeKernelKHR_ptr; - if(func != nullptr) + if (func != nullptr) { - return func(command_buffer, command_queue, properties, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_sync_points_in_wait_list, sync_point_wait_list, sync_point, mutable_handle); + return func(command_buffer, command_queue, properties, kernel, work_dim, global_work_offset, global_work_size, + local_work_size, num_sync_points_in_wait_list, sync_point_wait_list, sync_point, mutable_handle); } else { @@ -1211,14 +1178,13 @@ cl_int clCommandNDRangeKernelKHR( } } -cl_int clUpdateMutableCommandsKHR( - cl_command_buffer_khr command_buffer, - const cl_mutable_base_config_khr* mutable_config) +cl_int clUpdateMutableCommandsKHR(cl_command_buffer_khr command_buffer, + const cl_mutable_base_config_khr *mutable_config) { arm_compute::CLSymbols::get().load_default(); const auto func = arm_compute::CLSymbols::get().clUpdateMutableCommandsKHR_ptr; - if(func != nullptr) + if (func != nullptr) { return func(command_buffer, mutable_config); } @@ -1228,23 +1194,22 @@ cl_int clUpdateMutableCommandsKHR( } } -cl_mem -clImportMemoryARM(cl_context context, - cl_mem_flags flags, - const cl_import_properties_arm *properties, - void *memory, - size_t size, - cl_int *errcode_ret) +cl_mem clImportMemoryARM(cl_context context, + cl_mem_flags flags, + const cl_import_properties_arm *properties, + void *memory, + size_t size, + cl_int *errcode_ret) { arm_compute::CLSymbols::get().load_default(); auto func = arm_compute::CLSymbols::get().clImportMemoryARM_ptr; - if(func != nullptr) + if (func != nullptr) { return func(context, flags, properties, memory, size, errcode_ret); } else { - if(errcode_ret != nullptr) + if (errcode_ret != nullptr) { *errcode_ret = CL_OUT_OF_RESOURCES; } diff --git a/src/core/CL/cl_kernels/activation_float_helpers.h b/src/core/CL/cl_kernels/activation_float_helpers.h index 3f93c8d6fc..02faae2369 100644 --- a/src/core/CL/cl_kernels/activation_float_helpers.h +++ b/src/core/CL/cl_kernels/activation_float_helpers.h @@ -31,7 +31,8 @@ #endif // GPU_ARCH == GPU_ARCH_BIFROST // Hard-Swish -#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) +#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) // Logistic Activation #define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x))) @@ -49,13 +50,16 @@ #define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) // Leaky RELU Activation -#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) +#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) // Soft RELU Activation #define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x))) // ELU Activation -#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0))) +#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, \ + (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0))) // Absolute 
Activation #define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x)) @@ -70,7 +74,8 @@ #define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x)) // GELU Activation -#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237))) +#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \ + (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237))) // Identity Activation #define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x) diff --git a/src/core/CL/cl_kernels/activation_quant_helpers.h b/src/core/CL/cl_kernels/activation_quant_helpers.h index c420578546..c758ff1278 100644 --- a/src/core/CL/cl_kernels/activation_quant_helpers.h +++ b/src/core/CL/cl_kernels/activation_quant_helpers.h @@ -60,17 +60,17 @@ inline TYPE identiy_op(TYPE x) } #define ACTIVATION_OP2(op, x) op##_op(x) -#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x) +#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x) #if defined(S1_VAL) && defined(S2_VAL) #if defined(O1_VAL) && defined(O2_VAL) #define PERFORM_ACTIVATION_QUANT(act, data) \ ({ \ data = ACTIVATION_OP(act, data); \ - \ + \ VEC_DATA_TYPE(float, VEC_SIZE) \ fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \ - \ + \ fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \ data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \ }) @@ -78,17 +78,14 @@ inline TYPE identiy_op(TYPE x) #define PERFORM_ACTIVATION_QUANT(act, data) \ ({ \ data = ACTIVATION_OP(act, data); \ - \ + \ VEC_DATA_TYPE(float, VEC_SIZE) \ fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \ - \ + \ fdata = round((fdata) * ((float)S1_VAL / (float)S2_VAL)); \ data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \ }) #endif /* defined(O1_VAL) && defined(O2_VAL) */ #else /* defined(S1_VAL) && defined(S2_VAL) */ -#define PERFORM_ACTIVATION_QUANT(act, data) \ - ({ \ - data = ACTIVATION_OP(act, data); \ - }) +#define PERFORM_ACTIVATION_QUANT(act, data) ({ data = ACTIVATION_OP(act, data); }) #endif /* defined(S1_VAL) && defined(S2_VAL) */ diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h index 0e938cb668..4bef02314f 100644 --- a/src/core/CL/cl_kernels/gemm_helpers.h +++ b/src/core/CL/cl_kernels/gemm_helpers.h @@ -34,14 +34,14 @@ * */ #define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x) -#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x) +#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x) // offset == 0 -#define scalar_access_0_1(x) ((x).s0) -#define scalar_access_0_2(x) ((x).s01) -#define scalar_access_0_3(x) ((x).s012) -#define scalar_access_0_4(x) ((x).s0123) -#define scalar_access_0_8(x) ((x).s01234567) +#define scalar_access_0_1(x) ((x).s0) +#define scalar_access_0_2(x) ((x).s01) +#define scalar_access_0_3(x) ((x).s012) +#define scalar_access_0_4(x) ((x).s0123) +#define scalar_access_0_8(x) ((x).s01234567) #define scalar_access_0_16(x) ((x).s0123456789ABCDEF) // offset == 1 @@ -100,8 +100,7 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ - ({}) +#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) ({}) #define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); @@ -186,8 
+185,10 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) -#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) +#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ + LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) +#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ + LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) /** @} */ // end of group LOAD_TENSOR /** Load 2D tensor (consecutive rows and columns) with Z offset. @@ -202,8 +203,7 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ - ({}) +#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) ({}) #define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); @@ -279,8 +279,10 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** @}*/ // end of group LOAD_TENSOR_M0XN0 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). 
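An aside on the *_STR indirection these hunks reflow (LOAD_TENSOR_STR/LOAD_TENSOR, LOAD_TENSOR_M0XN0_STR, LOAD_BLOCK_STR, LOAD_BLOCK_PARTIAL_STR): the outer macro exists only to force its arguments to expand before the inner macro token-pastes them into a name such as LOAD_TENSOR_ROW_##M0. A minimal standalone sketch of the idiom, using hypothetical names that are not taken from this patch:

#include <cstdio>

// Hypothetical stand-in for the LOAD_TENSOR_ROW_##M0 family.
#define ROW_2() std::puts("load 2 rows")

#define LOAD_STR(M0) ROW_##M0() // inner macro: token-pastes its argument as-is
#define LOAD(M0) LOAD_STR(M0)   // outer macro: expands M0 first, then delegates

#define BLOCK_HEIGHT 2

int main()
{
    LOAD(BLOCK_HEIGHT); // BLOCK_HEIGHT -> 2 -> ROW_2() -> prints "load 2 rows"
    // Pasting directly (ROW_##M0 with M0 = BLOCK_HEIGHT) would form the
    // nonexistent token ROW_BLOCK_HEIGHT, which is why the extra level exists.
    return 0;
}

Without the two-level expansion, callers could only pass literal row counts, not macros or derived constants, to these block-load helpers.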
@@ -395,8 +397,10 @@ * @param[in] Z The z-axis offset vector * @{ */ -#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) -#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) /** @} */ // end of group LOAD_BLOCK /** Partially load the 0 to (n-1)th rows of the given variables @@ -517,8 +521,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) -#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) /** Load a block that can be partial in both x and y dimensions * * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty. @@ -541,22 +547,23 @@ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0. * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0. 
*/ -#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ - { \ - LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else \ - { \ - LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ +#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ + { \ + LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else \ + { \ + LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ } /** Load a block that can only be partial in x but not y. * @@ -578,14 +585,15 @@ * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0) * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0. */ -#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else \ - { \ - LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ +#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, \ + PARTIAL_COND_X) \ + if (!(PARTIAL_COND_X)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else \ + { \ + LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ } /** Load a block that can only be partial in y but not x. * @@ -607,14 +615,15 @@ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0) * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 
*/ -#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ - if(!(PARTIAL_COND_Y)) \ - { \ - LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ - } \ - else \ - { \ - LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ +#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_COND_Y) \ + if (!(PARTIAL_COND_Y)) \ + { \ + LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ + } \ + else \ + { \ + LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ } /** @} */ // end of group LOAD_BLOCK_PARTIAL /** Boundary-aware GeMM block load @@ -676,28 +685,33 @@ */ #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case1: No partial blocks in either x or y -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) #elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 // Case2: Partial blocks in y -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) #elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 // Case3: Partial blocks in x -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) #else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case4: Partial blocks in both x and y -#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ - LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) +#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ + LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 
-#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 +#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 /** @} */ // end of group LOAD_BLOCK_BOUNDARY_AWARE /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). @@ -795,8 +809,10 @@ * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels) * @{ */ -#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) -#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) +#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) +#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) /** @} */ // end of group LOAD_TEXTURE2D /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) passing the Y index for each row to be loaded. @@ -815,7 +831,7 @@ #define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##0; \ - if(Y_MASK##0 != 0) \ + if (Y_MASK##0 != 0) \ BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \ else \ BASENAME##0 = 0; @@ -824,7 +840,7 @@ LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##1; \ - if(Y_MASK##1 != 0) \ + if (Y_MASK##1 != 0) \ BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \ else \ BASENAME##1 = 0; @@ -833,7 +849,7 @@ LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##2; \ - if(Y_MASK##2 != 0) \ + if (Y_MASK##2 != 0) \ BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \ else \ BASENAME##2 = 0; @@ -842,7 +858,7 @@ LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##3; \ - if(Y_MASK##3 != 0) \ + if (Y_MASK##3 != 0) \ BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \ else \ BASENAME##3 = 0; @@ -851,7 +867,7 @@ LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##4; \ - if(Y_MASK##4 != 0) \ + if (Y_MASK##4 != 0) \ BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \ else \ BASENAME##4 = 0; @@ -860,7 +876,7 @@ LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##5; \ - if(Y_MASK##5 != 0) \ + if (Y_MASK##5 != 0) \ BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \ else \ BASENAME##5 = 0; @@ -869,7 +885,7 @@ LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##6; \ - if(Y_MASK##6 != 0) \ + if (Y_MASK##6 != 0) \ BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \ else \ BASENAME##6 = 0; @@ -878,7 +894,7 @@ LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, 
STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##7; \ - if(Y_MASK##7 != 0) \ + if (Y_MASK##7 != 0) \ BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \ else \ BASENAME##7 = 0; @@ -887,7 +903,7 @@ LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##8; \ - if(Y_MASK##8 != 0) \ + if (Y_MASK##8 != 0) \ BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \ else \ BASENAME##8 = 0; @@ -896,7 +912,7 @@ LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##9; \ - if(Y_MASK##9 != 0) \ + if (Y_MASK##9 != 0) \ BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \ else \ BASENAME##9 = 0; @@ -905,7 +921,7 @@ LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##A; \ - if(Y_MASK##A != 0) \ + if (Y_MASK##A != 0) \ BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \ else \ BASENAME##A = 0; @@ -914,7 +930,7 @@ LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##B; \ - if(Y_MASK##B != 0) \ + if (Y_MASK##B != 0) \ BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \ else \ BASENAME##B = 0; @@ -923,7 +939,7 @@ LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##C; \ - if(Y_MASK##C != 0) \ + if (Y_MASK##C != 0) \ BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \ else \ BASENAME##C = 0; @@ -932,7 +948,7 @@ LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##D; \ - if(Y_MASK##D != 0) \ + if (Y_MASK##D != 0) \ BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \ else \ BASENAME##D = 0; @@ -941,7 +957,7 @@ LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##E; \ - if(Y_MASK##E != 0) \ + if (Y_MASK##E != 0) \ BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \ else \ BASENAME##E = 0; @@ -950,7 +966,7 @@ LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ VEC_DATA_TYPE(DATA_TYPE, N0) \ BASENAME##F; \ - if(Y_MASK##F != 0) \ + if (Y_MASK##F != 0) \ BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \ else \ BASENAME##F = 0; @@ -976,8 +992,10 @@ * @param[in] Y_MASK The y-axis mask vector. 
If 0, forces BASENAMEn to 0 * @{ */ -#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) -#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) +#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ + LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) +#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ + LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) /** @} */ // end of group LOAD_BLOCK_INDIRECT /** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). @@ -1088,8 +1106,10 @@ * @param[in] STRIDE_Y The stride in y-axis direction * @{ */ -#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) -#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) +#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) +#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) /** @} */ // end of group LOAD_SCALAR_AS_VECTOR /** Basic macros to calculate Z offset values from Z0 to Zn-1 @@ -1187,8 +1207,10 @@ * @param[in] STRIDE_Y The stride value in y-axis direction * @{ */ -#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) -#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) +#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ + CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) +#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ + CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) /** @} */ // end of group CALCULATE_Z_OFFSET /** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1) @@ -1199,8 +1221,7 @@ * @param[in] SCALE The scale factor * @{ */ -#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ - BASENAME##0 *= (DATA_TYPE)SCALE; +#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE; #define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ @@ -1275,7 +1296,7 @@ * @{ */ #define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE) -#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) +#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) /** @} */ // end of group SCALE_BLOCK /** Create a new vector containing the values at the given index for a set of given vectors @@ 
-1287,8 +1308,7 @@ * @param[in] TYPE The data type of the destination vectors * @{ */ -#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \ - TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); +#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); #define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 2) \ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL); @@ -1297,13 +1317,20 @@ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL); #define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 4) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL); -#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ - VEC_DATA_TYPE(TYPE, 8) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); -#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ - VEC_DATA_TYPE(TYPE, 16) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL); + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL); +#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 8) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \ + (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); +#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 16) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \ + (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, \ + (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, \ + (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL); /** @} */ // end of group COLUMN_VECTORn /** Create a new vector containing the values at the given index. 
Utility macros for transposing a column-vector @@ -1315,8 +1342,7 @@ * @param[in] TYPE The data type of the destination vectors * @{ */ -#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \ - TYPE BASENAME##IDX_COL = (TYPE)((X##0)); +#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0)); #define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 2) \ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1)); @@ -1329,9 +1355,10 @@ #define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \ VEC_DATA_TYPE(TYPE, 8) \ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7)); -#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ - VEC_DATA_TYPE(TYPE, 16) \ - BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); +#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 16) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \ + (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); /** @} */ // end of group COLUMN_VECTOR_SCALARn /** Create transposed vectors of the given vectors @@ -1343,8 +1370,7 @@ * @param[in] TYPE The data type of the transposed vectors * @{ */ -#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \ - COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE); +#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE); #define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \ COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \ COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE); @@ -1417,8 +1443,7 @@ * @param[in] BIAS The basename of the added variables * @{ */ -#define ADD_ROW_1(BASENAME, BIAS) \ - BASENAME##0 += BIAS##0; +#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0; #define ADD_ROW_2(BASENAME, BIAS) \ ADD_ROW_1(BASENAME, BIAS) \ @@ -1493,7 +1518,7 @@ * @{ */ #define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS) -#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS) +#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS) /** @} */ // end of group ADD_BLOCK /** Broadcast (add single value) to each element of the destination variables @@ -1503,8 +1528,7 @@ * @param[in] BIAS The variable containing the value to add * @{ */ -#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ - BASENAME##0 += BIAS; +#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS; #define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ @@ -1578,7 +1602,7 @@ * @{ */ #define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS) -#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) +#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) /** @} */ // end of group ADD_BLOCK_BROADCAST /** Apply activation to the given variables @@ -1668,8 +1692,10 @@ * @param[in] B_VAL Additional value required by the activation * @{ */ -#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) -#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) +#define
ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) +#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) /** @} */ // end of group ACTIVATION_BLOCK /** Apply convert_<data_type> to the given variables @@ -1773,6 +1799,8 @@ * @param[in] BASENAME_DST The basename of the destination variables * @{ */ -#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) -#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) +#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) +#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) /** @} */ // end of group CONVERT_BLOCK diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h index b2ceaf92f3..87a1875f93 100644 --- a/src/core/CL/cl_kernels/helpers.h +++ b/src/core/CL/cl_kernels/helpers.h @@ -81,11 +81,11 @@ * @return The reversed vector * @{ */ -#define REV1(x) ((x)) -#define REV2(x) ((x).s10) -#define REV3(x) ((x).s210) -#define REV4(x) ((x).s3210) -#define REV8(x) ((x).s76543210) +#define REV1(x) ((x)) +#define REV2(x) ((x).s10) +#define REV3(x) ((x).s210) +#define REV4(x) ((x).s3210) +#define REV8(x) ((x).s76543210) #define REV16(x) ((x).sFEDCBA9876543210) /** @} */ // end of group REVn @@ -99,7 +99,7 @@ * @{ */ #define REVERSE_STR(x, s) REV##s((x)) -#define REVERSE(x, s) REVERSE_STR(x, s) +#define REVERSE(x, s) REVERSE_STR(x, s) /** @} */ // end of group REVERSE /** Circular-right-shift (rotate-right) the vector of size s by the amount of n. @@ -138,16 +138,16 @@ #define ROT8_7(x) ((x).s12345670) #define ROT8_8(x) ((x)) -#define ROT16_0(x) ((x)) -#define ROT16_1(x) ((x).sF0123456789ABCDE) -#define ROT16_2(x) ((x).sEF0123456789ABCD) -#define ROT16_3(x) ((x).sDEF0123456789ABC) -#define ROT16_4(x) ((x).sCDEF0123456789AB) -#define ROT16_5(x) ((x).sBCDEF0123456789A) -#define ROT16_6(x) ((x).sABCDEF0123456789) -#define ROT16_7(x) ((x).s9ABCDEF012345678) -#define ROT16_8(x) ((x).s89ABCDEF01234567) -#define ROT16_9(x) ((x).s789ABCDEF0123456) +#define ROT16_0(x) ((x)) +#define ROT16_1(x) ((x).sF0123456789ABCDE) +#define ROT16_2(x) ((x).sEF0123456789ABCD) +#define ROT16_3(x) ((x).sDEF0123456789ABC) +#define ROT16_4(x) ((x).sCDEF0123456789AB) +#define ROT16_5(x) ((x).sBCDEF0123456789A) +#define ROT16_6(x) ((x).sABCDEF0123456789) +#define ROT16_7(x) ((x).s9ABCDEF012345678) +#define ROT16_8(x) ((x).s89ABCDEF01234567) +#define ROT16_9(x) ((x).s789ABCDEF0123456) #define ROT16_10(x) ((x).s6789ABCDEF012345) #define ROT16_11(x) ((x).s56789ABCDEF01234) #define ROT16_12(x) ((x).s456789ABCDEF0123) @@ -168,7 +168,7 @@ * @{ */ #define ROTATE_STR(x, s, n) ROT##s##_##n(x) -#define ROTATE(x, s, n) ROTATE_STR(x, s, n) +#define ROTATE(x, s, n) ROTATE_STR(x, s, n) /** @} */ // end of group ROTATE /** Creates a vector of size n filled with offset values corresponding to the location of each element. 
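[Editor's note — not part of the commit] A usage sketch for the REVERSE/ROTATE dispatchers reformatted above; values are illustrative, and ROT4_1 is defined earlier in the same header alongside the ROT8_*/ROT16_* variants shown here:

    uchar4 v = (uchar4)(1, 2, 3, 4);
    uchar4 r = REVERSE(v, 4);   // expands to REV4(v), i.e. (v).s3210        -> (4, 3, 2, 1)
    uchar4 w = ROTATE(v, 4, 1); // expands to ROT4_1(v), a rotate-right by 1 -> (4, 1, 2, 3)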
@@ -179,11 +179,11 @@ * @return The vector filled with offset values * @{ */ -#define V_OFFS1(dt) (dt##1)(0) -#define V_OFFS2(dt) (dt##2)(0, 1) -#define V_OFFS3(dt) (dt##3)(0, 1, 2) -#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) -#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) +#define V_OFFS1(dt) (dt##1)(0) +#define V_OFFS2(dt) (dt##2)(0, 1) +#define V_OFFS3(dt) (dt##3)(0, 1, 2) +#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) +#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) #define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) /** @} */ // end of group V_OFFSn @@ -197,11 +197,11 @@ * @{ */ #define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) -#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) +#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) /** @} */ // end of group VEC_OFFS #define VLOAD_STR(size) vload##size -#define VLOAD(size) VLOAD_STR(size) +#define VLOAD(size) VLOAD_STR(size) /** Extended partial vload that correctly handles scalar values as well. * Load the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of load ops @@ -219,23 +219,23 @@ * @{ */ #define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size -#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) +#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) #define NO_LOAD(data, offs, ptr) \ { \ } // Size == 1 (scalar) -#define vload_partial_1_0 NO_LOAD -#define vload_partial_1_1 vload1 -#define vload_partial_1_2 NO_LOAD -#define vload_partial_1_3 NO_LOAD -#define vload_partial_1_4 NO_LOAD -#define vload_partial_1_5 NO_LOAD -#define vload_partial_1_6 NO_LOAD -#define vload_partial_1_7 NO_LOAD -#define vload_partial_1_8 NO_LOAD -#define vload_partial_1_9 NO_LOAD +#define vload_partial_1_0 NO_LOAD +#define vload_partial_1_1 vload1 +#define vload_partial_1_2 NO_LOAD +#define vload_partial_1_3 NO_LOAD +#define vload_partial_1_4 NO_LOAD +#define vload_partial_1_5 NO_LOAD +#define vload_partial_1_6 NO_LOAD +#define vload_partial_1_7 NO_LOAD +#define vload_partial_1_8 NO_LOAD +#define vload_partial_1_9 NO_LOAD #define vload_partial_1_10 NO_LOAD #define vload_partial_1_11 NO_LOAD #define vload_partial_1_12 NO_LOAD @@ -244,16 +244,16 @@ #define vload_partial_1_15 NO_LOAD #define vload_partial_1_16 NO_LOAD // Size == 2 -#define vload_partial_2_0 NO_LOAD -#define vload_partial_2_1 vload_partial_1 -#define vload_partial_2_2 vload_partial_2 -#define vload_partial_2_3 NO_LOAD -#define vload_partial_2_4 NO_LOAD -#define vload_partial_2_5 NO_LOAD -#define vload_partial_2_6 NO_LOAD -#define vload_partial_2_7 NO_LOAD -#define vload_partial_2_8 NO_LOAD -#define vload_partial_2_9 NO_LOAD +#define vload_partial_2_0 NO_LOAD +#define vload_partial_2_1 vload_partial_1 +#define vload_partial_2_2 vload_partial_2 +#define vload_partial_2_3 NO_LOAD +#define vload_partial_2_4 NO_LOAD +#define vload_partial_2_5 NO_LOAD +#define vload_partial_2_6 NO_LOAD +#define vload_partial_2_7 NO_LOAD +#define vload_partial_2_8 NO_LOAD +#define vload_partial_2_9 NO_LOAD #define vload_partial_2_10 NO_LOAD #define vload_partial_2_11 NO_LOAD #define vload_partial_2_12 NO_LOAD @@ -262,16 +262,16 @@ #define vload_partial_2_15 NO_LOAD #define vload_partial_2_16 NO_LOAD // Size == 3 -#define vload_partial_3_0 NO_LOAD -#define vload_partial_3_1 vload_partial_1 -#define vload_partial_3_2 vload_partial_2 -#define vload_partial_3_3 vload_partial_3 -#define vload_partial_3_4 NO_LOAD -#define vload_partial_3_5 NO_LOAD -#define vload_partial_3_6 NO_LOAD -#define vload_partial_3_7 NO_LOAD 
-#define vload_partial_3_8 NO_LOAD -#define vload_partial_3_9 NO_LOAD +#define vload_partial_3_0 NO_LOAD +#define vload_partial_3_1 vload_partial_1 +#define vload_partial_3_2 vload_partial_2 +#define vload_partial_3_3 vload_partial_3 +#define vload_partial_3_4 NO_LOAD +#define vload_partial_3_5 NO_LOAD +#define vload_partial_3_6 NO_LOAD +#define vload_partial_3_7 NO_LOAD +#define vload_partial_3_8 NO_LOAD +#define vload_partial_3_9 NO_LOAD #define vload_partial_3_10 NO_LOAD #define vload_partial_3_11 NO_LOAD #define vload_partial_3_12 NO_LOAD @@ -280,16 +280,16 @@ #define vload_partial_3_15 NO_LOAD #define vload_partial_3_16 NO_LOAD // Size == 4 -#define vload_partial_4_0 NO_LOAD -#define vload_partial_4_1 vload_partial_1 -#define vload_partial_4_2 vload_partial_2 -#define vload_partial_4_3 vload_partial_3 -#define vload_partial_4_4 vload_partial_4 -#define vload_partial_4_5 NO_LOAD -#define vload_partial_4_6 NO_LOAD -#define vload_partial_4_7 NO_LOAD -#define vload_partial_4_8 NO_LOAD -#define vload_partial_4_9 NO_LOAD +#define vload_partial_4_0 NO_LOAD +#define vload_partial_4_1 vload_partial_1 +#define vload_partial_4_2 vload_partial_2 +#define vload_partial_4_3 vload_partial_3 +#define vload_partial_4_4 vload_partial_4 +#define vload_partial_4_5 NO_LOAD +#define vload_partial_4_6 NO_LOAD +#define vload_partial_4_7 NO_LOAD +#define vload_partial_4_8 NO_LOAD +#define vload_partial_4_9 NO_LOAD #define vload_partial_4_10 NO_LOAD #define vload_partial_4_11 NO_LOAD #define vload_partial_4_12 NO_LOAD @@ -298,16 +298,16 @@ #define vload_partial_4_15 NO_LOAD #define vload_partial_4_16 NO_LOAD // Size == 8 -#define vload_partial_8_0 NO_LOAD -#define vload_partial_8_1 vload_partial_1 -#define vload_partial_8_2 vload_partial_2 -#define vload_partial_8_3 vload_partial_3 -#define vload_partial_8_4 vload_partial_4 -#define vload_partial_8_5 vload_partial_5 -#define vload_partial_8_6 vload_partial_6 -#define vload_partial_8_7 vload_partial_7 -#define vload_partial_8_8 vload_partial_8 -#define vload_partial_8_9 NO_LOAD +#define vload_partial_8_0 NO_LOAD +#define vload_partial_8_1 vload_partial_1 +#define vload_partial_8_2 vload_partial_2 +#define vload_partial_8_3 vload_partial_3 +#define vload_partial_8_4 vload_partial_4 +#define vload_partial_8_5 vload_partial_5 +#define vload_partial_8_6 vload_partial_6 +#define vload_partial_8_7 vload_partial_7 +#define vload_partial_8_8 vload_partial_8 +#define vload_partial_8_9 NO_LOAD #define vload_partial_8_10 NO_LOAD #define vload_partial_8_11 NO_LOAD #define vload_partial_8_12 NO_LOAD @@ -316,16 +316,16 @@ #define vload_partial_8_15 NO_LOAD #define vload_partial_8_16 NO_LOAD // Size == 16 -#define vload_partial_16_0 NO_LOAD -#define vload_partial_16_1 vload_partial_1 -#define vload_partial_16_2 vload_partial_2 -#define vload_partial_16_3 vload_partial_3 -#define vload_partial_16_4 vload_partial_4 -#define vload_partial_16_5 vload_partial_5 -#define vload_partial_16_6 vload_partial_6 -#define vload_partial_16_7 vload_partial_7 -#define vload_partial_16_8 vload_partial_8 -#define vload_partial_16_9 vload_partial_9 +#define vload_partial_16_0 NO_LOAD +#define vload_partial_16_1 vload_partial_1 +#define vload_partial_16_2 vload_partial_2 +#define vload_partial_16_3 vload_partial_3 +#define vload_partial_16_4 vload_partial_4 +#define vload_partial_16_5 vload_partial_5 +#define vload_partial_16_6 vload_partial_6 +#define vload_partial_16_7 vload_partial_7 +#define vload_partial_16_8 vload_partial_8 +#define vload_partial_16_9 vload_partial_9 #define 
vload_partial_16_10 vload_partial_10 #define vload_partial_16_11 vload_partial_11 #define vload_partial_16_12 vload_partial_12 @@ -351,17 +351,13 @@ * @param[in] PTR The base pointer * @{ */ -#define vload_partial_1(DATA, OFFSET, PTR) \ - DATA.s0 = vload1(OFFSET, PTR); +#define vload_partial_1(DATA, OFFSET, PTR) DATA.s0 = vload1(OFFSET, PTR); -#define vload_partial_2(DATA, OFFSET, PTR) \ - DATA.s01 = vload2(OFFSET, PTR); +#define vload_partial_2(DATA, OFFSET, PTR) DATA.s01 = vload2(OFFSET, PTR); -#define vload_partial_3(DATA, OFFSET, PTR) \ - DATA.s012 = vload3(OFFSET, PTR); +#define vload_partial_3(DATA, OFFSET, PTR) DATA.s012 = vload3(OFFSET, PTR); -#define vload_partial_4(DATA, OFFSET, PTR) \ - DATA.s0123 = vload4(OFFSET, PTR); +#define vload_partial_4(DATA, OFFSET, PTR) DATA.s0123 = vload4(OFFSET, PTR); #define vload_partial_5(DATA, OFFSET, PTR) \ vload_partial_4(DATA.s0123, OFFSET, PTR); \ @@ -375,8 +371,7 @@ vload_partial_4(DATA.s0123, OFFSET, PTR); \ vload_partial_3(DATA.s456, OFFSET, PTR + 4); -#define vload_partial_8(DATA, OFFSET, PTR) \ - DATA.s01234567 = vload8(OFFSET, PTR); +#define vload_partial_8(DATA, OFFSET, PTR) DATA.s01234567 = vload8(OFFSET, PTR); #define vload_partial_9(DATA, OFFSET, PTR) \ vload_partial_8(DATA.s01234567, OFFSET, PTR); \ @@ -406,13 +401,12 @@ vload_partial_8(DATA.s01234567, OFFSET, PTR); \ vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); -#define vload_partial_16(DATA, OFFSET, PTR) \ - DATA = vload16(OFFSET, PTR); +#define vload_partial_16(DATA, OFFSET, PTR) DATA = vload16(OFFSET, PTR); /** @} */ // end of group vload_partial_n /** @} */ // end of group VLOAD_PARTIAL -#define PIXEL_UNIT4 1 -#define PIXEL_UNIT8 2 +#define PIXEL_UNIT4 1 +#define PIXEL_UNIT8 2 #define PIXEL_UNIT16 4 /** Utility macro to convert a vector size in pixel unit.
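[Editor's note — not part of the commit] A sketch of how the vload_partial_* table above composes, assuming a __global float *src is in scope in the kernel. Loading seven elements into a float8 costs one vload4 plus one vload3 rather than seven scalar loads:

    float8 d = 0;
    VLOAD_PARTIAL(8, 7)(d, 0, src); // -> vload_partial_8_7 -> vload_partial_7(d, 0, src), i.e.
                                    //    vload_partial_4(d.s0123, 0, src); vload_partial_3(d.s456, 0, src + 4);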
@@ -425,27 +419,45 @@ * @{ */ #define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size -#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) +#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) /** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT #define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); -#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); -#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); +#define read_image2d_floatx2(img, x_coord, y_coord) \ + (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); +#define read_image2d_floatx4(img, x_coord, y_coord) \ + (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), \ + read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) #define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); -#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); -#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); +#define read_image2d_halfx2(img, x_coord, y_coord) \ + (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); +#define read_image2d_halfx4(img, x_coord, y_coord) \ + (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), \ + read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) #define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); -#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); -#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); +#define write_image2d_floatx2(img, x_coord, y_coord, values) \ + (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); +#define write_image2d_floatx4(img, x_coord, y_coord, values) \ + (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), \ + write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), \ + write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && 
defined(cl_khr_fp16) #define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); -#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); -#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); +#define write_image2d_halfx2(img, x_coord, y_coord, values) \ + (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); +#define write_image2d_halfx4(img, x_coord, y_coord, values) \ + (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \ + write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), \ + write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), \ + write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) /** Utility macro to read a 2D OpenCL image object. @@ -462,7 +474,7 @@ * @{ */ #define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) -#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) +#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) /** @} */ /** Utility macro to write a 2D OpenCL image object. @@ -478,26 +490,28 @@ * * @{ */ -#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) -#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) +#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) \ + write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) +#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) \ + WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) /** @} */ #define VSTORE_STR(size) vstore##size -#define VSTORE(size) VSTORE_STR(size) +#define VSTORE(size) VSTORE_STR(size) -#define float1 float -#define half1 half -#define char1 char -#define uchar1 uchar -#define short1 short +#define float1 float +#define half1 half +#define char1 char +#define uchar1 uchar +#define short1 short #define ushort1 ushort -#define int1 int -#define uint1 uint -#define long1 long -#define ulong1 ulong +#define int1 int +#define uint1 uint +#define long1 long +#define ulong1 ulong #define double1 double -#define vload1(OFFSET, PTR) *(OFFSET + PTR) +#define vload1(OFFSET, PTR) *(OFFSET + PTR) #define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA /** Extended partial vstore that correctly handles scalar values as well. 
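[Editor's note — not part of the commit] A usage sketch for the image read/write dispatchers reformatted above, assuming a kernel with __read_only image2d_t src_img and __write_only image2d_t dst_img. The x coordinate is in pixel units, where each pixel holds four elements:

    float16 p = READ_IMAGE2D(float, 4, src_img, x, y); // read_image2d_floatx4: four read_imagef calls packed into a float16
    WRITE_IMAGE2D(float, 4, dst_img, x, y, p);         // write_image2d_floatx4: four write_imagef calls from p.s0123 ... p.sCDEF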
@@ -516,23 +530,23 @@ * @{ */ #define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size -#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) +#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) #define NO_STORE(data, offs, ptr) \ { \ } // Size == 1 (scalar) -#define vstore_partial_1_0 NO_STORE -#define vstore_partial_1_1 vstore1 -#define vstore_partial_1_2 NO_STORE -#define vstore_partial_1_3 NO_STORE -#define vstore_partial_1_4 NO_STORE -#define vstore_partial_1_5 NO_STORE -#define vstore_partial_1_6 NO_STORE -#define vstore_partial_1_7 NO_STORE -#define vstore_partial_1_8 NO_STORE -#define vstore_partial_1_9 NO_STORE +#define vstore_partial_1_0 NO_STORE +#define vstore_partial_1_1 vstore1 +#define vstore_partial_1_2 NO_STORE +#define vstore_partial_1_3 NO_STORE +#define vstore_partial_1_4 NO_STORE +#define vstore_partial_1_5 NO_STORE +#define vstore_partial_1_6 NO_STORE +#define vstore_partial_1_7 NO_STORE +#define vstore_partial_1_8 NO_STORE +#define vstore_partial_1_9 NO_STORE #define vstore_partial_1_10 NO_STORE #define vstore_partial_1_11 NO_STORE #define vstore_partial_1_12 NO_STORE @@ -541,16 +555,16 @@ #define vstore_partial_1_15 NO_STORE #define vstore_partial_1_16 NO_STORE // Size == 2 -#define vstore_partial_2_0 NO_STORE -#define vstore_partial_2_1 vstore_partial_1 -#define vstore_partial_2_2 vstore_partial_2 -#define vstore_partial_2_3 NO_STORE -#define vstore_partial_2_4 NO_STORE -#define vstore_partial_2_5 NO_STORE -#define vstore_partial_2_6 NO_STORE -#define vstore_partial_2_7 NO_STORE -#define vstore_partial_2_8 NO_STORE -#define vstore_partial_2_9 NO_STORE +#define vstore_partial_2_0 NO_STORE +#define vstore_partial_2_1 vstore_partial_1 +#define vstore_partial_2_2 vstore_partial_2 +#define vstore_partial_2_3 NO_STORE +#define vstore_partial_2_4 NO_STORE +#define vstore_partial_2_5 NO_STORE +#define vstore_partial_2_6 NO_STORE +#define vstore_partial_2_7 NO_STORE +#define vstore_partial_2_8 NO_STORE +#define vstore_partial_2_9 NO_STORE #define vstore_partial_2_10 NO_STORE #define vstore_partial_2_11 NO_STORE #define vstore_partial_2_12 NO_STORE @@ -559,16 +573,16 @@ #define vstore_partial_2_15 NO_STORE #define vstore_partial_2_16 NO_STORE // Size == 3 -#define vstore_partial_3_0 NO_STORE -#define vstore_partial_3_1 vstore_partial_1 -#define vstore_partial_3_2 vstore_partial_2 -#define vstore_partial_3_3 vstore_partial_3 -#define vstore_partial_3_4 NO_STORE -#define vstore_partial_3_5 NO_STORE -#define vstore_partial_3_6 NO_STORE -#define vstore_partial_3_7 NO_STORE -#define vstore_partial_3_8 NO_STORE -#define vstore_partial_3_9 NO_STORE +#define vstore_partial_3_0 NO_STORE +#define vstore_partial_3_1 vstore_partial_1 +#define vstore_partial_3_2 vstore_partial_2 +#define vstore_partial_3_3 vstore_partial_3 +#define vstore_partial_3_4 NO_STORE +#define vstore_partial_3_5 NO_STORE +#define vstore_partial_3_6 NO_STORE +#define vstore_partial_3_7 NO_STORE +#define vstore_partial_3_8 NO_STORE +#define vstore_partial_3_9 NO_STORE #define vstore_partial_3_10 NO_STORE #define vstore_partial_3_11 NO_STORE #define vstore_partial_3_12 NO_STORE @@ -577,16 +591,16 @@ #define vstore_partial_3_15 NO_STORE #define vstore_partial_3_16 NO_STORE // Size == 4 -#define vstore_partial_4_0 NO_STORE -#define vstore_partial_4_1 vstore_partial_1 -#define vstore_partial_4_2 vstore_partial_2 -#define vstore_partial_4_3 vstore_partial_3 -#define vstore_partial_4_4 vstore_partial_4 -#define vstore_partial_4_5 NO_STORE -#define 
vstore_partial_4_6 NO_STORE -#define vstore_partial_4_7 NO_STORE -#define vstore_partial_4_8 NO_STORE -#define vstore_partial_4_9 NO_STORE +#define vstore_partial_4_0 NO_STORE +#define vstore_partial_4_1 vstore_partial_1 +#define vstore_partial_4_2 vstore_partial_2 +#define vstore_partial_4_3 vstore_partial_3 +#define vstore_partial_4_4 vstore_partial_4 +#define vstore_partial_4_5 NO_STORE +#define vstore_partial_4_6 NO_STORE +#define vstore_partial_4_7 NO_STORE +#define vstore_partial_4_8 NO_STORE +#define vstore_partial_4_9 NO_STORE #define vstore_partial_4_10 NO_STORE #define vstore_partial_4_11 NO_STORE #define vstore_partial_4_12 NO_STORE @@ -595,16 +609,16 @@ #define vstore_partial_4_15 NO_STORE #define vstore_partial_4_16 NO_STORE // Size == 8 -#define vstore_partial_8_0 NO_STORE -#define vstore_partial_8_1 vstore_partial_1 -#define vstore_partial_8_2 vstore_partial_2 -#define vstore_partial_8_3 vstore_partial_3 -#define vstore_partial_8_4 vstore_partial_4 -#define vstore_partial_8_5 vstore_partial_5 -#define vstore_partial_8_6 vstore_partial_6 -#define vstore_partial_8_7 vstore_partial_7 -#define vstore_partial_8_8 vstore_partial_8 -#define vstore_partial_8_9 NO_STORE +#define vstore_partial_8_0 NO_STORE +#define vstore_partial_8_1 vstore_partial_1 +#define vstore_partial_8_2 vstore_partial_2 +#define vstore_partial_8_3 vstore_partial_3 +#define vstore_partial_8_4 vstore_partial_4 +#define vstore_partial_8_5 vstore_partial_5 +#define vstore_partial_8_6 vstore_partial_6 +#define vstore_partial_8_7 vstore_partial_7 +#define vstore_partial_8_8 vstore_partial_8 +#define vstore_partial_8_9 NO_STORE #define vstore_partial_8_10 NO_STORE #define vstore_partial_8_11 NO_STORE #define vstore_partial_8_12 NO_STORE @@ -613,16 +627,16 @@ #define vstore_partial_8_15 NO_STORE #define vstore_partial_8_16 NO_STORE // Size == 16 -#define vstore_partial_16_0 NO_STORE -#define vstore_partial_16_1 vstore_partial_1 -#define vstore_partial_16_2 vstore_partial_2 -#define vstore_partial_16_3 vstore_partial_3 -#define vstore_partial_16_4 vstore_partial_4 -#define vstore_partial_16_5 vstore_partial_5 -#define vstore_partial_16_6 vstore_partial_6 -#define vstore_partial_16_7 vstore_partial_7 -#define vstore_partial_16_8 vstore_partial_8 -#define vstore_partial_16_9 vstore_partial_9 +#define vstore_partial_16_0 NO_STORE +#define vstore_partial_16_1 vstore_partial_1 +#define vstore_partial_16_2 vstore_partial_2 +#define vstore_partial_16_3 vstore_partial_3 +#define vstore_partial_16_4 vstore_partial_4 +#define vstore_partial_16_5 vstore_partial_5 +#define vstore_partial_16_6 vstore_partial_6 +#define vstore_partial_16_7 vstore_partial_7 +#define vstore_partial_16_8 vstore_partial_8 +#define vstore_partial_16_9 vstore_partial_9 #define vstore_partial_16_10 vstore_partial_10 #define vstore_partial_16_11 vstore_partial_11 #define vstore_partial_16_12 vstore_partial_12 @@ -648,17 +662,13 @@ * @param[in] PTR The base pointer * @{ */ -#define vstore_partial_1(DATA, OFFSET, PTR) \ - vstore1(DATA.s0, OFFSET, PTR); +#define vstore_partial_1(DATA, OFFSET, PTR) vstore1(DATA.s0, OFFSET, PTR); -#define vstore_partial_2(DATA, OFFSET, PTR) \ - vstore2(DATA.s01, OFFSET, PTR); +#define vstore_partial_2(DATA, OFFSET, PTR) vstore2(DATA.s01, OFFSET, PTR); -#define vstore_partial_3(DATA, OFFSET, PTR) \ - vstore3(DATA.s012, OFFSET, PTR); +#define vstore_partial_3(DATA, OFFSET, PTR) vstore3(DATA.s012, OFFSET, PTR); -#define vstore_partial_4(DATA, OFFSET, PTR) \ - vstore4(DATA.s0123, OFFSET, PTR); +#define vstore_partial_4(DATA, 
OFFSET, PTR) vstore4(DATA.s0123, OFFSET, PTR); #define vstore_partial_5(DATA, OFFSET, PTR) \ vstore_partial_4(DATA.s0123, OFFSET, PTR); \ @@ -672,8 +682,7 @@ vstore_partial_4(DATA.s0123, OFFSET, PTR); \ vstore_partial_3(DATA.s456, OFFSET, PTR + 4); -#define vstore_partial_8(DATA, OFFSET, PTR) \ - vstore8(DATA.s01234567, OFFSET, PTR); +#define vstore_partial_8(DATA, OFFSET, PTR) vstore8(DATA.s01234567, OFFSET, PTR); #define vstore_partial_9(DATA, OFFSET, PTR) \ vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ @@ -703,186 +712,156 @@ vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); -#define vstore_partial_16(DATA, OFFSET, PTR) \ - vstore16(DATA, OFFSET, PTR); +#define vstore_partial_16(DATA, OFFSET, PTR) vstore16(DATA, OFFSET, PTR); /** @} */ // end of group vstore_partial_n /** @} */ // end of group VSTORE_PARTIAL // Convert built-in functions with _sat modifier are not supported in floating point so we create defines // without _sat to overcome this issue -#define convert_float_sat convert_float -#define convert_float1_sat convert_float -#define convert_float2_sat convert_float2 -#define convert_float3_sat convert_float3 -#define convert_float4_sat convert_float4 -#define convert_float8_sat convert_float8 +#define convert_float_sat convert_float +#define convert_float1_sat convert_float +#define convert_float2_sat convert_float2 +#define convert_float3_sat convert_float3 +#define convert_float4_sat convert_float4 +#define convert_float8_sat convert_float8 #define convert_float16_sat convert_float16 -#define convert_half_sat convert_float -#define convert_half1_sat convert_half -#define convert_half2_sat convert_half2 -#define convert_half3_sat convert_half3 -#define convert_half4_sat convert_half4 -#define convert_half8_sat convert_half8 -#define convert_half16_sat convert_half16 - -#define convert_float1 convert_float -#define convert_half1 convert_half -#define convert_char1 convert_char -#define convert_uchar1 convert_uchar -#define convert_short1 convert_short +#define convert_half_sat convert_float +#define convert_half1_sat convert_half +#define convert_half2_sat convert_half2 +#define convert_half3_sat convert_half3 +#define convert_half4_sat convert_half4 +#define convert_half8_sat convert_half8 +#define convert_half16_sat convert_half16 + +#define convert_float1 convert_float +#define convert_half1 convert_half +#define convert_char1 convert_char +#define convert_uchar1 convert_uchar +#define convert_short1 convert_short #define convert_ushort1 convert_ushort -#define convert_int1 convert_int -#define convert_uint1 convert_uint -#define convert_long1 convert_long -#define convert_ulong1 convert_ulong +#define convert_int1 convert_int +#define convert_uint1 convert_uint +#define convert_long1 convert_long +#define convert_ulong1 convert_ulong #define convert_double1 convert_double -#define convert_char1_sat convert_char_sat -#define convert_uchar1_sat convert_uchar_sat -#define convert_uchar2_sat convert_uchar2_sat -#define convert_uchar3_sat convert_uchar3_sat -#define convert_uchar4_sat convert_uchar4_sat -#define convert_uchar8_sat convert_uchar8_sat +#define convert_char1_sat convert_char_sat +#define convert_uchar1_sat convert_uchar_sat +#define convert_uchar2_sat convert_uchar2_sat +#define convert_uchar3_sat convert_uchar3_sat +#define convert_uchar4_sat convert_uchar4_sat +#define convert_uchar8_sat convert_uchar8_sat #define convert_uchar16_sat convert_uchar16_sat -#define convert_short1_sat convert_short_sat +#define
convert_short1_sat convert_short_sat #define convert_ushort1_sat convert_ushort_sat -#define convert_int1_sat convert_int_sat -#define convert_uint1_sat convert_uint_sat -#define convert_long1_sat convert_long_sat -#define convert_ulong1_sat convert_ulong_sat +#define convert_int1_sat convert_int_sat +#define convert_uint1_sat convert_uint_sat +#define convert_long1_sat convert_long_sat +#define convert_ulong1_sat convert_ulong_sat #define convert_double1_sat convert_double_sat #define VEC_DATA_TYPE_STR(type, size) type##size -#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) +#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) #define CONVERT_STR(x, type) (convert_##type((x))) -#define CONVERT(x, type) CONVERT_STR(x, type) +#define CONVERT(x, type) CONVERT_STR(x, type) #define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) -#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) +#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) #define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) -#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) +#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) -#define select_vec_dt_uchar(size) uchar##size -#define select_vec_dt_char(size) char##size +#define select_vec_dt_uchar(size) uchar##size +#define select_vec_dt_char(size) char##size #define select_vec_dt_ushort(size) ushort##size -#define select_vec_dt_short(size) short##size -#define select_vec_dt_half(size) short##size -#define select_vec_dt_uint(size) uint##size -#define select_vec_dt_int(size) int##size -#define select_vec_dt_float(size) int##size -#define select_vec_dt_ulong(size) ulong##size -#define select_vec_dt_long(size) long##size +#define select_vec_dt_short(size) short##size +#define select_vec_dt_half(size) short##size +#define select_vec_dt_uint(size) uint##size +#define select_vec_dt_int(size) int##size +#define select_vec_dt_float(size) int##size +#define select_vec_dt_ulong(size) ulong##size +#define select_vec_dt_long(size) long##size #define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) -#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) -#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) +#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) +#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) -#define signed_int_vec_dt_uchar(size) char##size -#define signed_int_vec_dt_char(size) char##size +#define signed_int_vec_dt_uchar(size) char##size +#define signed_int_vec_dt_char(size) char##size #define signed_int_vec_dt_ushort(size) short##size -#define signed_int_vec_dt_short(size) short##size -#define signed_int_vec_dt_half(size) short##size -#define signed_int_vec_dt_uint(size) int##size -#define signed_int_vec_dt_int(size) int##size -#define signed_int_vec_dt_float(size) int##size -#define signed_int_vec_dt_ulong(size) long##size -#define signed_int_vec_dt_long(size) long##size +#define signed_int_vec_dt_short(size) short##size +#define signed_int_vec_dt_half(size) short##size +#define signed_int_vec_dt_uint(size) int##size +#define signed_int_vec_dt_int(size) int##size +#define signed_int_vec_dt_float(size) int##size +#define signed_int_vec_dt_ulong(size) long##size +#define signed_int_vec_dt_long(size) long##size #define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) -#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, 
size) -#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) - -#define sum_reduce_1(x) (x) -#define sum_reduce_2(x) ((x).s0) + ((x).s1) -#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) -#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) -#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) +#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) +#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) + +#define sum_reduce_1(x) (x) +#define sum_reduce_2(x) ((x).s0) + ((x).s1) +#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) +#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) +#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) #define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) #define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) -#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) +#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) -#define prod_reduce_1(x) (x) -#define prod_reduce_2(x) ((x).s0) * ((x).s1) -#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) -#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) -#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) +#define prod_reduce_1(x) (x) +#define prod_reduce_2(x) ((x).s0) * ((x).s1) +#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) +#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) +#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) #define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) #define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) -#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) +#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) -#define max_reduce_1(x) (x) -#define max_reduce_2(x) max(((x).s0), ((x).s1)) -#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) -#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) -#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) +#define max_reduce_1(x) (x) +#define max_reduce_2(x) max(((x).s0), ((x).s1)) +#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) +#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) +#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) #define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) #define MAX_REDUCE_STR(x, size) max_reduce_##size(x) -#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) +#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) -#define min_reduce_1(x) (x) -#define min_reduce_2(x) min(((x).s0), ((x).s1)) -#define min_reduce_3(x) min(min_reduce_2((x).s01), ((x).s2)) -#define min_reduce_4(x) min(min_reduce_2((x).s01), min_reduce_2((x).s23)) -#define min_reduce_8(x) min(min_reduce_4((x).s0123), min_reduce_4((x).s4567)) +#define min_reduce_1(x) (x) +#define min_reduce_2(x) min(((x).s0), ((x).s1)) +#define min_reduce_3(x) min(min_reduce_2((x).s01), ((x).s2)) +#define min_reduce_4(x) min(min_reduce_2((x).s01), min_reduce_2((x).s23)) +#define min_reduce_8(x) min(min_reduce_4((x).s0123), min_reduce_4((x).s4567)) #define min_reduce_16(x) min(min_reduce_8((x).s01234567), min_reduce_8((x).s89ABCDEF)) #define MIN_REDUCE_STR(x, size) min_reduce_##size(x) -#define MIN_REDUCE(x, size) MIN_REDUCE_STR(x, size) - -#define VECTOR_DECLARATION(name) \ - __global uchar *name##_ptr, \ 
- uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_offset_first_element_in_bytes - -#define IMAGE_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_offset_first_element_in_bytes - -#define TENSOR3D_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_stride_z, \ - uint name##_step_z, \ - uint name##_offset_first_element_in_bytes - -#define TENSOR4D_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_stride_z, \ - uint name##_step_z, \ - uint name##_stride_w, \ - uint name##_step_w, \ - uint name##_offset_first_element_in_bytes - -#define TENSOR5D_DECLARATION(name) \ - __global uchar *name##_ptr, \ - uint name##_stride_x, \ - uint name##_step_x, \ - uint name##_stride_y, \ - uint name##_step_y, \ - uint name##_stride_z, \ - uint name##_step_z, \ - uint name##_stride_w, \ - uint name##_step_w, \ - uint name##_stride_v, \ - uint name##_step_v, \ - uint name##_offset_first_element_in_bytes +#define MIN_REDUCE(x, size) MIN_REDUCE_STR(x, size) + +#define VECTOR_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_offset_first_element_in_bytes + +#define IMAGE_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_offset_first_element_in_bytes + +#define TENSOR3D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_stride_z, uint name##_step_z, uint name##_offset_first_element_in_bytes + +#define TENSOR4D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, \ + uint name##_offset_first_element_in_bytes + +#define TENSOR5D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \ + uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, uint name##_stride_v, \ + uint name##_step_v, uint name##_offset_first_element_in_bytes #define CONVERT_TO_VECTOR_STRUCT(name) \ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) @@ -890,38 +869,47 @@ #define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) -#define CONVERT_TO_IMAGE_STRUCT(name) \ - update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) +#define CONVERT_TO_IMAGE_STRUCT(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y) #define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, 
name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z) -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0, name##_stride_z, name##_step_z) -#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ - update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z) -#define CONVERT_TO_TENSOR3D_STRUCT(name) \ - update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ - name##_stride_z, name##_step_z) +#define CONVERT_TO_TENSOR3D_STRUCT(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) -#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ - update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) +#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0, name##_stride_z, 0) -#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ - update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ - name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) +#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y, name##_stride_z, name##_step_z, name##_stride_w, \ + name##_step_w, mod_size) -#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ - update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) +#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) -#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ - tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ - name##_stride_z, name##_step_z) +#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ + tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \ + name##_stride_y, name##_step_y, 
name##_stride_z, name##_step_z) /** Structure to hold Vector information */ typedef struct Vector @@ -970,10 +958,10 @@ typedef struct Tensor4D * * @return An image object */ -inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) +inline Vector +update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) { - Vector vector = - { + Vector vector = { .ptr = ptr, .offset_first_element_in_bytes = offset_first_element_in_bytes, .stride_x = stride_x, @@ -993,15 +981,13 @@ inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_ * * @return An image object */ -inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) +inline Image update_image_workitem_ptr( + __global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) { - Image img = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y - }; + Image img = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y}; img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; return img; } @@ -1019,16 +1005,21 @@ inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el * * @return A 3D tensor object */ -inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint step_y, + uint stride_z, + uint step_z) { - Image img = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y - }; - img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; + Image img = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y}; + img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + + get_global_id(2) * step_z; return img; } @@ -1045,17 +1036,22 @@ inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint o * * @return A 3D tensor object */ -inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint step_y, + uint stride_z, + uint step_z) { - Tensor3D tensor = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z - }; - tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; + Tensor3D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = 
stride_z}; + tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + + get_global_id(2) * step_z; return tensor; } @@ -1072,34 +1068,44 @@ inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_fi * * @return A 3D tensor object */ -inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint step_y, + uint stride_z, + uint step_z) { - Tensor3D tensor = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z - }; + Tensor3D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z}; return tensor; } -inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, - uint step_w, - uint mod_size) +inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, + uint step_x, + uint stride_y, + uint step_y, + uint stride_z, + uint step_z, + uint stride_w, + uint step_w, + uint mod_size) { - Tensor4D tensor = - { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z, - .stride_w = stride_w - }; - - tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; + Tensor4D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z, + .stride_w = stride_w}; + + tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; return tensor; } @@ -1171,7 +1177,8 @@ inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint wid const uint x = index; - return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; + return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + + tensor->offset_first_element_in_bytes; } #endif // _HELPER_H diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h index 562c5d3236..166260a3c0 100644 --- a/src/core/CL/cl_kernels/helpers_asymm.h +++ b/src/core/CL/cl_kernels/helpers_asymm.h @@ -34,7 +34,7 @@ * @return The converted vector */ #define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x))) -#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) +#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) /** Quantize a floating-point scalar value to 8-bit asymmetric * @@ -84,14 +84,15 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return quantized values */ -#define QUANTIZE_IMPL(type, size) \ - inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float 
offset, float scale) \ - { \ - VEC_DATA_TYPE(float, size) \ - out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ - VEC_DATA_TYPE(type, size) \ - res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ - return res; \ +#define QUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(type, size) \ + quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ + { \ + VEC_DATA_TYPE(float, size) \ + out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ + VEC_DATA_TYPE(type, size) \ + res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ + return res; \ } /** Dequantize a vector of values to floating-point @@ -101,10 +102,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return dequantized values in floating point */ -#define DEQUANTIZE_IMPL(type, size) \ - inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ - { \ - return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ +#define DEQUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(float, size) \ + dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ + { \ + return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ } /** Correctly-rounded-to-nearest division by a power-of-two. @@ -113,18 +115,17 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Correctly-rounded-to-nearest division by a power-of-two. */ -#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ - { \ - const VEC_DATA_TYPE(int, size) \ - zero = (VEC_DATA_TYPE(int, size))0; \ - const VEC_DATA_TYPE(int, size) \ - one = (VEC_DATA_TYPE(int, size))1; \ - VEC_DATA_TYPE(int, size) \ - mask = (one << exponent) - one; \ - VEC_DATA_TYPE(int, size) \ - threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \ - return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \ +#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ + { \ + const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \ + const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ + VEC_DATA_TYPE(int, size) \ + mask = (one << exponent) - one; \ + VEC_DATA_TYPE(int, size) \ + threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \ + return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \ } /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), @@ -167,27 +168,29 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Result in fixed-point format Q0. 
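 * For reference, the implementation below can be read as exp(a) = exp(-1/8) * p(a + 1/8), where
 * p(t) = 1 + t + t^2/2 + t^3/6 + t^4/24 is the truncated Taylor series of exp(t) and the Q0.31
 * constant 1895147668 is round(2^31 * exp(-1/8)). This is an illustrative reading of the code,
 * not a statement about its accuracy bounds.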
 */
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
- { \
- const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
- const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
- const int k_fractional_bits = 31; \
- VEC_DATA_TYPE(int, size) \
- x = a + (1 << (k_fractional_bits - 3)); \
- VEC_DATA_TYPE(int, size) \
- x2 = ASYMM_MULT(x, x, size); \
- VEC_DATA_TYPE(int, size) \
- x3 = ASYMM_MULT(x2, x, size); \
- VEC_DATA_TYPE(int, size) \
- x4 = ASYMM_MULT(x2, x2, size); \
- VEC_DATA_TYPE(int, size) \
- x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
- VEC_DATA_TYPE(int, size) \
- x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
- VEC_DATA_TYPE(int, size) \
- x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
- return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
+ const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
+ const int k_fractional_bits = 31; \
+ VEC_DATA_TYPE(int, size) \
+ x = a + (1 << (k_fractional_bits - 3)); \
+ VEC_DATA_TYPE(int, size) \
+ x2 = ASYMM_MULT(x, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x3 = ASYMM_MULT(x2, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4 = ASYMM_MULT(x2, x2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
+ return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
 }

/** Each bit of the result is set to the corresponding bit of either then_val or
@@ -198,10 +201,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
 *
 * @returns Result containing bits from @p then_val or from @p else_val depending on whether the corresponding bit in @p if_mask is set.
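 * For example, with size = 4, a lane of if_mask that is all ones selects the corresponding lane
 * of then_val, and an all-zeros lane selects else_val, via the branchless expression
 * (if_mask & then_val) ^ (~if_mask & else_val) used below.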
*/ -#define ASYMM_SELECT_USING_MASK_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \ - { \ - return (if_mask & then_val) ^ (~if_mask & else_val); \ +#define ASYMM_SELECT_USING_MASK_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size( \ + VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \ + { \ + return (if_mask & then_val) ^ (~if_mask & else_val); \ } /** For each element of input vector, the corresponding bits of the result item are set @@ -234,18 +238,19 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \ } -#define EXP_BARREL_SHIFTER_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ - { \ - if(k_integer_bits > exponent) \ - { \ - const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \ - return ASYMM_SELECT_USING_MASK( \ - ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ - ASYMM_MULT(result, fp_multiplier, size), result, size); \ - } \ - \ - return result; \ +#define EXP_BARREL_SHIFTER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ + int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ + { \ + if (k_integer_bits > exponent) \ + { \ + const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \ + return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ + ASYMM_MULT(result, fp_multiplier, size), result, size); \ + } \ + \ + return result; \ } /** Calculates \f$ exp(x) \f$ for x < 0. @@ -254,39 +259,40 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Result in fixed-point format Q0. 
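 * As an illustrative reading of the constants below: 1672461947 is approximately
 * round(2^31 * exp(-1/4)) and 1302514674 is approximately round(2^31 * exp(-1/2)), so each
 * EXP_BARREL_SHIFTER step folds in one factor exp(-2^exponent), selected by one bit of the
 * integer part of the input (assuming the Q0.31 encoding used throughout this file).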
*/ -#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ - { \ - const int k_fractional_bits = 31 - k_integer_bits; \ - VEC_DATA_TYPE(int, size) \ - k_one_quarter = 1 << (k_fractional_bits - 2); \ - VEC_DATA_TYPE(int, size) \ - mask = k_one_quarter - 1; \ - VEC_DATA_TYPE(int, size) \ - a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ - VEC_DATA_TYPE(int, size) \ - a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ - VEC_DATA_TYPE(int, size) \ - result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \ - VEC_DATA_TYPE(int, size) \ - remainder = a_mod_quarter_minus_one_quarter - a; \ - \ - result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ - \ - if(k_integer_bits > 5) \ - { \ - const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \ - result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \ - } \ - \ - const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ - return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \ +#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ + { \ + const int k_fractional_bits = 31 - k_integer_bits; \ + VEC_DATA_TYPE(int, size) \ + k_one_quarter = 1 << (k_fractional_bits - 2); \ + VEC_DATA_TYPE(int, size) \ + mask = k_one_quarter - 1; \ + VEC_DATA_TYPE(int, size) \ + a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ + VEC_DATA_TYPE(int, size) \ + a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ + VEC_DATA_TYPE(int, size) \ + result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, \ + size); \ + VEC_DATA_TYPE(int, size) \ + remainder = a_mod_quarter_minus_one_quarter - a; \ + \ + result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ + \ + if (k_integer_bits > 5) \ + { \ + 
const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
+ result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+ return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
 }

/** Calculates the product of an integer value by a power of two, with either a positive exponent
@@ -297,26 +303,27 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
 *
 * @return Arithmetic left or right shift.
 */
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
- { \
- if(exponent < 0) \
- { \
- return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
- } \
- \
- const VEC_DATA_TYPE(int, size) min = INT_MIN; \
- const VEC_DATA_TYPE(int, size) max = INT_MAX; \
- int threshold = ((1 << (31 - exponent)) - 1); \
- VEC_DATA_TYPE(int, size) \
- positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
- VEC_DATA_TYPE(int, size) \
- negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
- VEC_DATA_TYPE(int, size) \
- result = x << exponent; \
- result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
- result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
- return result; \
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+ { \
+ if (exponent < 0) \
+ { \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) min = INT_MIN; \
+ const VEC_DATA_TYPE(int, size) max = INT_MAX; \
+ int threshold = ((1 << (31 - exponent)) - 1); \
+ VEC_DATA_TYPE(int, size) \
+ positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ result = x << exponent; \
+ result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
+ result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
+ return result; \
 }

/** Calculates (a+b)/2, rounded to the nearest integer.
@@ -326,20 +333,21 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
 *
 * @return (a+b)/2, rounded to the nearest integer.
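 * Worked example (illustrative): asymm_rounding_half_sum4((int4)7, (int4)4) widens both terms to
 * long, forms sum = 11, and returns (11 + 1) / 2 = 6 on every lane, i.e. ties round away from
 * zero; the widening is what makes the half-sum overflow-safe.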
*/ -#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(long, size) \ - a64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - sum = a64 + b64; \ - const VEC_DATA_TYPE(long, size) one = 1; \ - const VEC_DATA_TYPE(long, size) minus_one = -1; \ - VEC_DATA_TYPE(long, size) \ - sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \ - return convert_int##size((sum + sign) / 2); \ +#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(long, size) \ + a64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + sum = a64 + b64; \ + const VEC_DATA_TYPE(long, size) one = 1; \ + const VEC_DATA_TYPE(long, size) minus_one = -1; \ + VEC_DATA_TYPE(long, size) \ + sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \ + return convert_int##size((sum + sign) / 2); \ } /** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1). @@ -354,12 +362,12 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ VEC_DATA_TYPE(int, size) \ - half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ + half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \ const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \ VEC_DATA_TYPE(int, size) \ x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \ - for(int i = 0; i < 3; i++) \ + for (int i = 0; i < 3; i++) \ { \ VEC_DATA_TYPE(int, size) \ half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \ @@ -378,48 +386,57 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Rescaled value. 
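 * E.g. rescaling from a Q5.26 source to a Q0.31 destination uses exponent = 5 - 0 = 5, i.e. a
 * saturating left shift by five bits via ASYMM_SATURATING_ROUNDING_MULT_BY_POW2 (the format
 * names here are illustrative, not taken from the calling kernels).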
*/ -#define ASYMM_RESCALE_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \ - { \ - int exponent = src_integer_bits - dst_integer_bits; \ - return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ +#define ASYMM_RESCALE_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \ + { \ + int exponent = src_integer_bits - dst_integer_bits; \ + return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ } -#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) -#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) +#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) +#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) #define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale) -#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size) +#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size) #define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent) -#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) -#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b) -#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size) +#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) +#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b) +#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size) #define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) -#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) -#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val) -#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) +#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \ + asymm_select_using_mask##size(if_mask, then_val, else_val) +#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) #define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a) -#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder) +#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) \ + exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder) #define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) 
asymm_exp_on_negative_values##size(a, k_integer_bits) -#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) -#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a) -#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) -#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent) +#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) +#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a) +#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \ + asymm_saturating_rounding_mult_by_pow2##size(x, exponent) #define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b) -#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits) -#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) - -#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ - { \ - const int left_shift = shift > 0 ? shift : 0; \ - const int right_shift = shift > 0 ? 0 : -shift; \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \ +#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) \ + asymm_rescale##size(value, src_integer_bits, dst_integer_bits) +#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ + ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) + +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ + { \ + const int left_shift = shift > 0 ? shift : 0; \ + const int right_shift = shift > 0 ? 
0 : -shift; \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \ } -#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift) +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \ + multiply_by_quantized_multiplier##size(input, qmul, shift) QUANTIZE_IMPL(uchar, 1) QUANTIZE_IMPL(char, 1) diff --git a/src/core/CL/cl_kernels/load_store_utility.h b/src/core/CL/cl_kernels/load_store_utility.h index 4ba2b2ca3a..4daf0adc89 100644 --- a/src/core/CL/cl_kernels/load_store_utility.h +++ b/src/core/CL/cl_kernels/load_store_utility.h @@ -223,8 +223,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** @} */ // end of group STORE_BLOCK /** Convert and store a block of the given size M0xN0 @@ -245,8 +247,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** @} */ // end of group CONVERT_STORE_BLOCK /** Partially store the 0 to (n-1)th rows of the given variables @@ -365,8 +369,10 @@ * @param[in] Z The offset in z-axis direction * @{ */ -#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) /** Store a block that can be partial in both x and y dimensions * * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. @@ -388,22 +394,23 @@ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. 
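 * Illustrative scenario: storing a 4x16 block at the bottom-right corner of an output with three
 * leftover rows and two leftover columns makes both conditions true, so the final branch below
 * emits a PARTIAL_STORE_M0 x PARTIAL_STORE_N0 = 3x2 store (block and tensor extents assumed).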
*/ -#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ - { \ - STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ - { \ - STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ - { \ - STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else \ - { \ - STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ +#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, \ + PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ + { \ + STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } \ + else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ + { \ + STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } \ + else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ + { \ + STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } \ + else \ + { \ + STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ } /** Store a block that can only be partial in x but not y. * @@ -425,7 +432,7 @@ * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. */ #define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X)) \ + if (!(PARTIAL_COND_X)) \ { \ STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ } \ @@ -453,7 +460,7 @@ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 
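 * PARTIAL_COND_Y is typically derived from the work-item id, e.g. true only for the last block
 * of rows when the M dimension is not a multiple of M0; the exact predicate is supplied by the
 * calling kernel, so the condition described here is only illustrative.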
*/ #define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ - if(!(PARTIAL_COND_Y)) \ + if (!(PARTIAL_COND_Y)) \ { \ STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ } \ @@ -517,23 +524,28 @@ #if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case1: No partial blocks in either x or y -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) #elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 // Case2: Partial blocks in y -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) #elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 // Case3: Partial blocks in x -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) #else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 // Case4: Partial blocks in both x and y -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ - STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) +#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) \ + STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \ + PARTIAL_COND_Y, PARTIAL_COND_X) #endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 @@ -560,8 +572,7 @@ #define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) #else // defined(PARTIAL_STORE_M0) -#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ - ((uint)(y * M0)) +#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) ((uint)(y * M0)) #endif // defined(PARTIAL_STORE_M0) /** @} */ // end of group COMPUTE_M0_START_ROW diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h index bed94a7b3b..cb2f4b0319 100644 --- a/src/core/CL/cl_kernels/repeat.h +++ b/src/core/CL/cl_kernels/repeat.h @@ -75,7 +75,9 @@ P_X##_DEF(F, P_A, P_B, P_C); \ REPEAT_3_15(P_X, P_A, P_B, P_C) -#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM +#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \ + REPEAT_3_##P_NUM(P_OP, P_A, P_B, \ + P_C) //One level of indirection to 
ensure order of expansion does not affect preprocessing P_NUM
#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)

// Repeat macros with 4 params, excluding the implicit ID param
@@ -126,52 +128,59 @@
 P_X##_DEF(F, P_A, P_B, P_C, P_D); \
 REPEAT_4_15(P_X, P_A, P_B, P_C, P_D)

-#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \
+ REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, \
+ P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D)

// Macro for initializing N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
+#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)

// Macro for initializing N variables by converting the data type. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
+#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT)

// Macro for initializing N variables by converting the data type with saturation. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
-#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
+#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \
+ REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)

// Macro for adding a constant to N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
+#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL)

// Macro for multiplying N variables (VAR_B) by a constant (VAL) and adding to other N variables (VAR_A). Generates N statements that define VAR_A##N = RHS_ACCESSOR_DEF(...)
-#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
+#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL)

// Macro for adding a vector to N-variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC
-#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
+#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)

// Macro for adding two N-variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
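// For example (with hypothetical variable names), REPEAT_ADD_TWO_VARS(3, acc, bias) generates
//   acc0 += bias0; acc1 += bias1; acc2 += bias2;
// one statement per ID, with IDs written as hexadecimal digits to match the REPEAT_3_##N recursion above.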
#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID
-#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
+#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)

// Macro for performing Max between a constant and N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
+#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL)

// Macro for performing Min between a constant and N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
+#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL)

// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE to N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)

// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE to N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)

// Macro for performing per-channel ASYMM_MULT_BY_QUANT_MULTIPLIER to N variables.
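// Illustrative usage (hypothetical names): after an accumulation loop,
//   REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(4, N0, c, res_mul, res_shift);
// requantizes c0..c3 in place, selecting the greater-than-one or less-than-one multiplier path
// per element according to the sign of RES_SHIFT, as the definition below shows.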
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ @@ -182,6 +191,7 @@ VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \ }) -#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) #endif // ARM_COMPUTE_REPEAT_H diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h index 642483ab3c..6595bd1981 100644 --- a/src/core/CL/cl_kernels/warp_helpers.h +++ b/src/core/CL/cl_kernels/warp_helpers.h @@ -31,11 +31,13 @@ * @param[in] border_size Border size of the image * */ -inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size) +inline const float8 +clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size) { const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size); const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size); - return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3); + return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, + clamped_y.s3); } /** Clamps the given coordinates to the borders. @@ -74,7 +76,8 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int */ inline const float8 get_neighbour_coords(const float2 coord) { - return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1); + return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, + /*br*/ coord.s0 + 1, coord.s1 + 1); } /** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values @@ -85,37 +88,38 @@ inline const float8 get_neighbour_coords(const float2 coord) * @param[in] height Height of the image * @param[in] border_size Border size */ -inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size) +inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border( + const Image *in, const float8 coords, const float width, const float height, const float border_size) { // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image. 
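// (Summary of the computation below: the four neighbouring texel centres of each point are
// gathered, clamped to the valid region, and blended as
//   value = tl*(1-ax)*(1-ay) + tr*ax*(1-ay) + bl*(1-ax)*ay + br*ax*ay,
// where (ax, ay) is the fractional part of the sampling coordinate.)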
// Sets the 4x4 coordinates for each of the four input texels const float8 fc = floor(coords); - const float16 c1 = (float16)( - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size), - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size)); - const float16 c2 = (float16)( - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size), - clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size)); + const float16 c1 = + (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size)); + const float16 c2 = + (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size)); // Loads the values from the input image const float16 t = (float16)( - /* tl, tr, bl, br */ - * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)), - *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)), - *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)), - *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)), - *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)), - *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)), - *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)), - *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf))); - const float8 a = coords - fc; - const float8 b = ((float8)(1.f)) - a; - const float4 fr = (float4)( - ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)), - ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)), - ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)), - ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7))); + /* tl, tr, bl, br */ + *((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)), + *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)), + *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)), + *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)), + *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)), + *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)), + *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)), + *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf))); + const float8 a = coords - fc; + const float8 b = ((float8)(1.f)) - a; + const float4 fr = + (float4)(((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)), + ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * 
b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)), + ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)), + ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7))); return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4)); } @@ -126,7 +130,8 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const * @param[in] width Width of the image * @param[in] height Height of the image */ -inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height) +inline const VEC_DATA_TYPE(DATA_TYPE, 4) + bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height) { return bilinear_interpolate_with_border(in, coords, width, height, 1); } diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp index 2728958add..5b72354abe 100644 --- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp +++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -44,16 +45,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::S64); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Only ARG_IDX_MAX and ARG_IDX_MIN are supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64, DataType::U64); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64, + DataType::U64); } return Status{}; @@ -66,22 +71,34 @@ CLArgMinMaxLayerKernel::CLArgMinMaxLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op); } -void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, 
ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape{ input->info()->tensor_shape() }; + TensorShape output_shape{input->info()->tensor_shape()}; output_shape.set(axis, 1); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(DataType::S32).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(DataType::S32) + .reset_padding() + .set_is_resizable(true)); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -90,11 +107,14 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, // Set build options const auto adjusted_vector_size = adjust_vec_size(16U, input->info()->dimension(0)); - const auto vector_size = (adjusted_vector_size == 3U && axis == 0U) ? 2U : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16. + const auto vector_size = (adjusted_vector_size == 3U && axis == 0U) + ? 2U + : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16. CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % vector_size)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(input->info()->dimension(0) % vector_size)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vector_size)); build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE"); build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN"); @@ -104,7 +124,7 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, // Create kernel std::string kernel_axis_name; - switch(axis) + switch (axis) { case 0: build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0))); @@ -135,7 +155,10 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) +Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); return Status{}; @@ -146,7 +169,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - switch(_reduction_axis) + switch (_reduction_axis) { case 0: { @@ -154,7 +177,8 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) Window out_window(window); Window in_window(window); out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); - in_window.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), 
_input->info()->dimension(0))); + in_window.set(Window::DimX, + Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); in_window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1u)); // Get first input and output slices @@ -166,15 +190,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_2D_tensor_argument(idx, _input, in_slice); add_2D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); + } while (in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); } break; case 1: { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); + Window window_in{window}; + window_in.set(Window::DimY, + Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); Window in_slice = window_in.first_slice_window_2D(); Window out_slice = window.first_slice_window_2D(); @@ -184,15 +208,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_2D_tensor_argument(idx, _input, in_slice); add_2D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); + } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); } break; case 2: { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); + Window window_in{window}; + window_in.set(Window::DimZ, + Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); Window in_slice = window_in.first_slice_window_3D(); Window out_slice = window.first_slice_window_3D(); @@ -202,14 +226,13 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _input, in_slice); add_3D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); + } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); } break; case 3: { // Get first input and output slices - Window window_in{ window }; + Window window_in{window}; window_in.set(3, Window::Dimension(0, 1, 1)); Window in_slice = window_in.first_slice_window_4D(); Window out_slice = window.first_slice_window_4D(); @@ -220,8 +243,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_4D_tensor_argument(idx, _input, in_slice); add_4D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); + } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); } break; default: diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h index 5f36bdf113..fb3b41b0de 100644 --- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h +++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H #include "arm_compute/core/Types.h" + 
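Callers never drive CLArgMinMaxLayerKernel (reformatted above) directly; it is scheduled through the runtime function CLArgMinMaxLayer. A minimal usage sketch, assuming the public ACL runtime API rather than anything in this diff, with made-up shapes:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init(); // default OpenCL device and queue

        CLTensor input, output;
        input.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));

        // Reducing along axis 0: as in configure() above, the reduced dimension
        // collapses to 1 and the output is auto-initialised with S32 indices.
        CLArgMinMaxLayer argmax;
        argmax.configure(&input, 0, &output, ReductionOperation::ARG_IDX_MAX);

        input.allocator()->allocate();
        output.allocator()->allocate();
        // ... fill `input`, then:
        argmax.run();
        CLScheduler::get().sync();
        return 0;
    }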
#include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -72,7 +73,11 @@ public: * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op); /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel. * @@ -84,7 +89,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp index 3fa8a8edaa..c88a852a44 100644 --- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp @@ -23,58 +23,64 @@ */ #include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" using namespace arm_compute; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_UNUSED(epsilon); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); - if(beta != nullptr) + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); + if (beta != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); } - if(gamma != nullptr) + if (gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, 
gamma); } - if(act_info.enabled()) + if (act_info.enabled()) { ActivationLayerInfo::ActivationFunction act = act_info.activation(); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32 && input->data_type() != DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU - && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU - && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU && + act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + act != + ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a()); } - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -86,14 +92,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output) { - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(16 / input->element_size(), input->dimension(0)); // Configure kernel window Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); bool window_changed = false; - if(output != nullptr) + if (output != nullptr) { AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); window_changed = update_window_and_padding(win, input_access, output_access); @@ -104,30 +111,50 @@ std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, window_changed = update_window_and_padding(win, input_access); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel() - : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0), _run_in_place(false) + : _input(nullptr), + _output(nullptr), + _mean(nullptr), + _var(nullptr), + _beta(nullptr), + _gamma(nullptr), + _epsilon(0), + _run_in_place(false) { _type = CLKernelType::ELEMENTWISE; } -void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, - float epsilon, ActivationLayerInfo act_info) +void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info); } -void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, - const ICLTensor *gamma, - float epsilon, ActivationLayerInfo act_info) +void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var); - auto padding_info = get_padding_info({ input, output, mean, var, beta, gamma }); + auto padding_info = get_padding_info({input, output, mean, var, beta, gamma}); _input = input; _output = output; _mean = mean; @@ -142,13 +169,15 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_ mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr, (gamma != nullptr) ? 
gamma->info() : nullptr, epsilon, act_info)); - unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); + unsigned int num_elems_processed_per_iteration = + adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); // Set build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); @@ -157,29 +186,33 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_ build_opts.add_option_if(gamma == nullptr, "-DUSE_DEFAULT_GAMMA"); // Create kernel - _kernel = create_kernel(compile_context, "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = + create_kernel(compile_context, + "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Set kernel static arguments unsigned int include_output = (!_run_in_place) ? 
1 : 0; - unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() + 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters - if(_beta != nullptr) + unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() + + 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters + if (_beta != nullptr) { idx += num_arguments_per_1D_tensor(); // Skip beta parameter } - if(_gamma != nullptr) + if (_gamma != nullptr) { idx += num_arguments_per_1D_tensor(); // Skip gamma parameter } _kernel.setArg<cl_float>(idx++, _epsilon); - if(output != nullptr) + if (output != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); } // Configure kernel window - if(input->info()->data_layout() == DataLayout::NHWC) + if (input->info()->data_layout() == DataLayout::NHWC) { Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); @@ -205,18 +238,23 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_ _config_id += lower_string(string_from_data_layout(input->info()->data_layout())); } -Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { const bool run_in_place = (output == nullptr) || (output == input); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info)); - if(input->data_layout() != DataLayout::NHWC) + if (input->data_layout() != DataLayout::NHWC) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get()) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? 
nullptr : output->clone().get()) + .first); } return Status{}; @@ -236,11 +274,11 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor(); add_1D_tensor_argument(idx, _mean, vector_slice); add_1D_tensor_argument(idx, _var, vector_slice); - if(_beta != nullptr) + if (_beta != nullptr) { add_1D_tensor_argument(idx, _beta, vector_slice); } - if(_gamma != nullptr) + if (_gamma != nullptr) { add_1D_tensor_argument(idx, _gamma, vector_slice); } @@ -249,11 +287,10 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue { idx = 0; add_3D_tensor_argument(idx, _input, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, _output, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h index acbe0f2a26..1a88d2a8c5 100644 --- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h +++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -64,7 +65,13 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. */ - void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f, + void configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta = nullptr, + const ICLTensor *gamma = nullptr, + float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); /** Set the input and output tensors. * @@ -82,8 +89,15 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, - const ICLTensor *gamma = nullptr, float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta = nullptr, + const ICLTensor *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel * * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result. 
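As a reminder of what this kernel fuses: each element is normalised with its channel's statistics, then scaled and shifted, optionally followed by one of the RELU-family activations validated above. A scalar reference sketch (illustrative only, not part of this diff; beta and gamma fall back to 0 and 1 when their tensors are omitted, matching the -DUSE_DEFAULT_BETA / -DUSE_DEFAULT_GAMMA build options, and 0.001f is the default epsilon documented above):

    #include <cmath>

    inline float batch_norm_ref(
        float x, float mean, float var, float beta, float gamma, float epsilon = 0.001f)
    {
        const float x_hat = (x - mean) / std::sqrt(var + epsilon); // epsilon avoids division by zero
        return gamma * x_hat + beta; // the fused activation, if enabled, is applied to this result
    }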
@@ -99,10 +113,14 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr, - float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta = nullptr, + const ITensorInfo *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp index 143a842d02..c640b5a8d6 100644 --- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp +++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp @@ -25,13 +25,14 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" -#include "arm_compute/core/TensorInfo.h" using namespace arm_compute::misc::shape_calculator; namespace arm_compute @@ -46,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -54,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf return Status{}; } -Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status validate_arguments_static(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); @@ -66,10 +71,11 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorShape expected_output_shape = compute_batch_to_space_shape(input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info); - const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = compute_batch_to_space_shape( + input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info); + const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -79,8 +85,7 @@ Status 
validate_arguments_static(const ITensorInfo *input, const int block_shape } } // namespace -CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel() - : _input(nullptr), _block_shape(nullptr), _output(nullptr) +CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel() : _input(nullptr), _block_shape(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -90,11 +95,14 @@ void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTenso configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); } -void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) +void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, block_shape, output }); + auto padding_info = get_padding_info({input, block_shape, output}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info())); @@ -106,8 +114,9 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3))); - _kernel = create_kernel(compile_context, "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); - + _kernel = create_kernel(compile_context, + "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -116,47 +125,65 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) +void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info); } -void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, - const CropInfo &crop_info) +void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - const TensorShape output_shape = compute_batch_to_space_shape(input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); + const TensorShape output_shape = compute_batch_to_space_shape( + input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info)); + 
ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info)); _input = input; _output = output; // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3))); build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x)); build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y)); build_opts.add_option("-DCROP_LEFT=" + support::cpp11::to_string(crop_info.left)); build_opts.add_option("-DCROP_TOP=" + support::cpp11::to_string(crop_info.top)); - _kernel = create_kernel(compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel( + compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); ICLKernel::configure_internal(win); } -Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output)); return Status{}; } -Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info)); @@ -185,7 +212,7 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu unsigned int idx = 0; add_4D_tensor_argument(idx, _input, slice_in); add_argument(idx, batch_id); - if(_block_shape != nullptr) + if (_block_shape != nullptr) { add_1D_tensor_argument(idx, _block_shape, vector_slice); } @@ -193,7 +220,6 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu enqueue(queue, *this, slice_out, lws_hint()); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h index a05184cd5b..b9d3e66fe2 100644 --- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h +++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -65,7 +66,10 @@ public: * * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output); /** Initialise the kernel's inputs and output (Static block shape). * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -74,7 +78,11 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed */ - void configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info); + void configure(const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info); /** Initialise the kernel's inputs and output (Static block shape). * * @param[in] compile_context The compile context to be used. @@ -84,7 +92,12 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. 
@@ -106,7 +119,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info); + static Status validate(const ITensorInfo *input, + const int32_t block_shape_x, + const int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBitwiseKernel.cpp b/src/core/CL/kernels/CLBitwiseKernel.cpp index 11e6d021a5..de3fb43de8 100644 --- a/src/core/CL/kernels/CLBitwiseKernel.cpp +++ b/src/core/CL/kernels/CLBitwiseKernel.cpp @@ -28,25 +28,29 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute { -CLBitwiseKernel::CLBitwiseKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +CLBitwiseKernel::CLBitwiseKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op) +void CLBitwiseKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + BitwiseOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); - if(op != BitwiseOperation::NOT) + if (op != BitwiseOperation::NOT) { ARM_COMPUTE_ERROR_ON_NULLPTR(input2); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); @@ -56,7 +60,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I // Output auto initialization if not yet initialized auto_init_if_empty(*(output->info()), *(input1->info())); - auto padding_info = get_padding_info({ input1, input2, output }); + auto padding_info = get_padding_info({input1, input2, output}); // Configure kernel window const unsigned int vec_size_x = adjust_vec_size(16 / output->info()->element_size(), output->info()->dimension(0)); @@ -68,7 +72,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I // Create kernel std::string kernel_name = ""; - switch(op) + switch (op) { case BitwiseOperation::AND: kernel_name = "bitwise_and"; @@ -107,13 +111,12 @@ void CLBitwiseKernel::run(const Window &window, cl::CommandQueue &queue) { unsigned int idx = 0; add_2D_tensor_argument(idx, _input1, slice); - if(_input2 != nullptr) + if (_input2 != nullptr) { add_2D_tensor_argument(idx, _input2, slice); } add_2D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLBitwiseKernel.h b/src/core/CL/kernels/CLBitwiseKernel.h index c5a999643d..2c74955ae4 100644 --- a/src/core/CL/kernels/CLBitwiseKernel.h +++ b/src/core/CL/kernels/CLBitwiseKernel.h @@ -59,7 +59,11 @@ public: * @param[out] output Destination tensor. Data types supported: U8. * @param[in] op Bitwise operation to perform. Supported: AND, OR, NOT, XOR. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + BitwiseOperation op); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp index 72de854afb..f32c518e29 100644 --- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp +++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -40,7 +41,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status validate_arguments(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(boxes); @@ -53,7 +57,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2); const bool is_qasymm16 = boxes->data_type() == DataType::QASYMM16; - if(is_qasymm16) + if (is_qasymm16) { const UniformQuantizationInfo boxes_qinfo = boxes->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.scale != 0.125f); @@ -65,12 +69,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas); } - if(pred_boxes->total_size() > 0) + if (pred_boxes->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, boxes); ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2); - if(is_qasymm16) + if (is_qasymm16) { const UniformQuantizationInfo pred_boxes_qinfo = pred_boxes->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes_qinfo.scale != 0.125f); @@ -83,22 +87,31 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe } } // namespace -CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel() - : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr) +CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel() : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void 
CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info); } -void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); - auto padding_info = get_padding_info({ boxes, pred_boxes, deltas }); - auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info())); + auto padding_info = get_padding_info({boxes, pred_boxes, deltas}); + auto_init_if_empty(*pred_boxes->info(), deltas->info() + ->clone() + ->set_data_type(boxes->info()->data_type()) + .set_quantization_info(boxes->info()->quantization_info())); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info)); @@ -128,7 +141,7 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con build_opts.add_option_if(info.apply_scale(), "-DSCALE_AFTER=" + float_to_string_with_full_precision(info.scale())); build_opts.add_option_if(info.correct_transform_coords(), "-DOFFSET=1"); - if(is_quantized) + if (is_quantized) { build_opts.add_option("-DDATA_TYPE_DELTAS=" + get_cl_type_from_data_type(deltas->info()->data_type())); const UniformQuantizationInfo boxes_qinfo = boxes->info()->quantization_info().uniform(); @@ -148,12 +161,15 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con // Since the number of columns is a multiple of 4 by definition, we don't need to pad the tensor const unsigned int num_elems_processed_per_iteration = 4; - Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info)); return Status{}; diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h index 08f350e86a..9a1bb49bb9 100644 --- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h +++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h @@ -58,7 +58,10 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. 
* */ - void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info); + void configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -71,7 +74,11 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform * @@ -85,7 +92,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info); + static Status validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp index a2a0bc4fb4..ec58bf9e7a 100644 --- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp +++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -46,15 +47,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient"); - const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); + const unsigned int channels = + input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + num_groups == channels, + "Channel shuffling with same number of groups as number of channels would be inefficient"); // There cannot be more groups than channels ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, + "The number of channels must be a multiple of the number of groups"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -70,11 +75,12 @@ std::pair<Status, Window> 
validate_and_configure_window(ITensorInfo *input, ITen auto_init_if_empty(*output, *input->clone()); const bool is_nhwc = input->data_layout() == DataLayout::NHWC; - if(is_nhwc) + if (is_nhwc) { - unsigned int num_elems_processed_per_iteration_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x)); - Window win_collapsed = win.collapse(win, Window::DimZ); + unsigned int num_elems_processed_per_iteration_x = + adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x)); + Window win_collapsed = win.collapse(win, Window::DimZ); return std::make_pair(Status{}, win_collapsed); } else @@ -83,22 +89,25 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen constexpr unsigned int num_elems_processed_per_iteration_y = 2; // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + Window win = calculate_max_window( + *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, + num_elems_processed_per_iteration_y); + AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, + num_elems_processed_per_iteration_y); const bool window_changed = update_window_and_padding(win, input_access, output_access); Window win_collapsed = win.collapse(win, Window::DimZ); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win_collapsed); } } } // namespace -CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel() - : _input(nullptr), _output(nullptr) +CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -108,23 +117,27 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups); } -void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups) +void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; - const DataLayout data_layout = input->info()->data_layout(); - const bool is_nhwc = data_layout == DataLayout::NHWC; - const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); - unsigned int vec_size_x = 0; - unsigned int vec_size_x_leftovers = 0; - if(is_nhwc) + const DataLayout data_layout = input->info()->data_layout(); + const bool is_nhwc = data_layout == DataLayout::NHWC; + const unsigned int channels = + input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); + unsigned int vec_size_x = 0; + unsigned int vec_size_x_leftovers = 0; + if (is_nhwc) { - vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); + vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; } else @@ -170,13 +183,14 @@ void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_cont _config_id += support::cpp11::to_string(output->info()->dimension(1)); _config_id += "_"; _config_id += support::cpp11::to_string(output->info()->dimension(2)); - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } } -Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) +Status +CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h index 31c007f17e..43c939ebd8 100644 --- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h +++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h @@ -60,7 +60,10 @@ public: * @param[out] output Output tensor. Data type supported: Same as @p input * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups. 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups); /** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel * * @param[in] input Input tensor info. Data types supported: All. diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp index f4d6316517..f27270733e 100644 --- a/src/core/CL/kernels/CLComparisonKernel.cpp +++ b/src/core/CL/kernels/CLComparisonKernel.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,14 +39,10 @@ namespace arm_compute namespace { // Create supported comparisons map -const std::map<ComparisonOperation, std::string> supported_comparison_ops = -{ - { ComparisonOperation::Equal, "EQUAL" }, - { ComparisonOperation::NotEqual, "NOTEQUAL" }, - { ComparisonOperation::Greater, "GREATER" }, - { ComparisonOperation::GreaterEqual, "GREATEREQUAL" }, - { ComparisonOperation::Less, "LESS" }, - { ComparisonOperation::LessEqual, "LESSEQUAL" }, +const std::map<ComparisonOperation, std::string> supported_comparison_ops = { + {ComparisonOperation::Equal, "EQUAL"}, {ComparisonOperation::NotEqual, "NOTEQUAL"}, + {ComparisonOperation::Greater, "GREATER"}, {ComparisonOperation::GreaterEqual, "GREATEREQUAL"}, + {ComparisonOperation::Less, "LESS"}, {ComparisonOperation::LessEqual, "LESSEQUAL"}, }; int calculate_num_elems_processed_per_iteration(const ITensorInfo &input) @@ -53,7 +50,10 @@ int calculate_num_elems_processed_per_iteration(const ITensorInfo &input) return 16 / input.element_size(); } -Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ComparisonOperation operation) +Status validate_arguments(const ITensorInfo &input1, + const ITensorInfo &input2, + const ITensorInfo &output, + ComparisonOperation operation) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1); ARM_COMPUTE_RETURN_ERROR_ON(input1.data_type() == DataType::UNKNOWN); @@ -64,7 +64,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured output - if(output.total_size() > 0) + if (output.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8); ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), @@ -76,7 +76,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) { - const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); + const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); const unsigned int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(input1); // Auto initialize output if not initialized @@ -90,27 +90,34 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo 
&input1, ITe AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration); AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration); - bool window_changed = update_window_and_padding(win_input1, input1_access) - || update_window_and_padding(win_input2, input2_access) - || update_window_and_padding(win, output_access); + bool window_changed = update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace -CLComparisonKernel::CLComparisonKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +CLComparisonKernel::CLComparisonKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLComparisonKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation) +void CLComparisonKernel::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation); } -void CLComparisonKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation) +void CLComparisonKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), operation)); @@ -129,10 +136,11 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons // Set kernel build options std::set<std::string> build_opts; build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())); - build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info()))); + build_opts.emplace("-DVEC_SIZE=" + + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info()))); build_opts.emplace("-DOP=" + operation_name); build_opts.emplace("-DOP_NAME=" + lower_string(operation_name)); - if(is_data_type_quantized(input1->info()->data_type())) + if (is_data_type_quantized(input1->info()->data_type())) { const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform(); @@ -160,12 +168,16 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons _config_id += lower_string(string_from_data_layout(input1->info()->data_layout())); } -Status CLComparisonKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation) +Status CLComparisonKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation operation) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, operation)); 
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first); return Status{}; } @@ -181,17 +193,18 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue) bool can_collapse = true; const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) { can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) { can_collapse = (in_shape1[d] == in_shape2[d]); } } bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; @@ -212,16 +225,16 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } BorderSize CLComparisonKernel::border_size() const { const int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(*_input1->info()); - const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize{ 0, border, 0, 0 }; + const unsigned int replicateSize = + _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize{0, border, 0, 0}; } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLComparisonKernel.h b/src/core/CL/kernels/CLComparisonKernel.h index 0b94190183..174a6c9bf9 100644 --- a/src/core/CL/kernels/CLComparisonKernel.h +++ b/src/core/CL/kernels/CLComparisonKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLCOMPARISONKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -64,7 +65,11 @@ public: * @param[out] output Destination tensor. Data types supported: U8. * @param[in] operation Comparison operation to use. 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + ComparisonOperation operation); /** Static function to check if given info will lead to a valid configuration of @ref CLComparisonKernel * * @param[in] input1 Source tensor. Data types supported: All. @@ -74,10 +79,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation operation); // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; + void run(const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; private: diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp index 76af5d564a..f8ecc4c098 100644 --- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp +++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" @@ -40,7 +41,8 @@ CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel() _type = CLKernelType::ELEMENTWISE; } -Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, +Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, const PadStrideInfo &info) { ARM_COMPUTE_UNUSED(info); @@ -60,7 +62,7 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c)); - for(size_t i = 3; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i)); } @@ -68,20 +70,21 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co return Status{}; } -void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, - const PadStrideInfo &info) +void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, - const PadStrideInfo &info) +void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PadStrideInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -119,7 +122,7 @@ void 
CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu const int out_end_y = _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1; const int out_step_y = _info.stride().second; - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: { @@ -137,8 +140,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu add_3D_tensor_argument(idx, _input, slice_in); add_3D_tensor_argument(idx, _output, slice_out); enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out)); + } while (collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out)); break; } case DataLayout::NHWC: @@ -156,8 +158,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu add_3D_tensor_argument(idx, _input, slice_in); add_3D_tensor_argument(idx, _output, slice_out); enqueue(queue, *this, slice_out, lws_hint()); - } - while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); break; } default: diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h index e0d1322341..762989a836 100644 --- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h +++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h @@ -62,7 +62,10 @@ public: * @param[out] output Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info Contains padding and stride information described in @ref PadStrideInfo. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PadStrideInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample * * @param[in] input Source tensor info. Data types supported: All. 
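The three files above already exercise most of the new formatting rules: a 120-column limit, one parameter per line once a signature wraps, a space after control-flow keywords, brace-init lists without inner padding, right-aligned pointers, aligned consecutive declarations, and do-while closers folded onto `} while (...);`. Since the revised clang-format configuration file is not part of this delivery, the following .clang-format excerpt is only a plausible reconstruction of the options that would produce these changes under version 14.0.6 (the BasedOnStyle choice in particular is an assumption):

Language: Cpp
BasedOnStyle: LLVM          # assumption; the base style is not stated in the patch
ColumnLimit: 120
IndentWidth: 4
BreakBeforeBraces: Allman
PointerAlignment: Right
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: true
BinPackParameters: false    # one parameter per line when a declaration wraps
BinPackArguments: true      # call sites may still pack several arguments per line
Cpp11BracedListStyle: true
SpaceBeforeParens: ControlStatements
SortIncludes: CaseInsensitive
IncludeBlocks: Regroup      # arm_compute/, src/ and support/ headers end up in separate blocks
IncludeCategories:
  - Regex: '^"arm_compute/'
    Priority: 1
  - Regex: '^"src/'
    Priority: 2
  - Regex: '^"support/'
    Priority: 3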
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp index 0fc0ff8168..b33e0a8b6f 100644 --- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp +++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -38,7 +39,11 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, input_info, weights_info); @@ -53,19 +58,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_w) != deconv_info.stride().first); ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_h) != deconv_info.stride().second); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32); - if(!is_qasymm) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S32); + if (!is_qasymm) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_info, weights_info); } - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * weights_info->dimension(idx_b)); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * + weights_info->dimension(idx_b)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != input_info->dimension(idx_w)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != input_info->dimension(idx_h)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) != input_info->dimension(idx_b)); - if(bias != nullptr) + if (bias != nullptr) { - if(is_qasymm) + if (is_qasymm) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -76,19 +83,26 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights_info->dimension(idx_b)); } - if(output->total_size() != 0) + if (output->total_size() != 0) { const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info); + auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), + weights_info->dimension(idx_w), weights_info->dimension(idx_h), + stride_info); - const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, 
*weights_info); + const TensorShape output_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); } return Status{}; } -std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info) +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, + ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -97,11 +111,17 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info); + auto out_dims = + deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), + weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info); - const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info); + const TensorShape output_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info); - auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout).set_quantization_info(input->quantization_info())); + auto_init_if_empty(*output, input->clone() + ->set_tensor_shape(output_shape) + .set_data_layout(data_layout) + .set_quantization_info(input->quantization_info())); Window win = calculate_max_window(*input); @@ -109,29 +129,37 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input } } // namespace -CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel() - : _add_bias(false), - _bias(nullptr) +CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel() : _add_bias(false), _bias(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, +void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, const PadStrideInfo &deconv_info) { configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, input_info, weights_info, deconv_info); } -void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, - const ITensorInfo *weights_info, - const PadStrideInfo &deconv_info) +void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, input_info, weights_info); - 
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), input_info, weights_info, deconv_info)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), + output->info(), input_info, weights_info, deconv_info)); - auto padding_info = get_padding_info({ input, bias, output }); + auto padding_info = get_padding_info({input, bias, output}); // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info); + auto win_config = + validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); const DataLayout data_layout = input_info->data_layout(); @@ -178,7 +206,11 @@ void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, +Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, input_info, weights_info, deconv_info)); @@ -194,7 +226,7 @@ void CLDeconvolutionReshapeOutputKernel::run(const Window &window, cl::CommandQu unsigned int idx = 0; add_3D_tensor_argument(idx, _input, collapsed); add_3D_tensor_argument(idx, _output, collapsed); - if(_add_bias) + if (_add_bias) { add_1D_tensor_argument(idx, _bias, collapsed); } diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h index ce354fa86f..8f436b07e3 100644 --- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h +++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h @@ -67,7 +67,12 @@ public: * @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input. * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported. */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info); + void configure(const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info); /** Initialise the kernel's source and destination. * * @param[in] compile_context The compile context to be used. @@ -79,8 +84,13 @@ public: * @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input. * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported. 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, - const PadStrideInfo &deconv_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info); /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionReshapeOutputKernel. * @@ -93,7 +103,12 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const PadStrideInfo &deconv_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp index 5c1dc4fbf6..cdf19ab2e1 100644 --- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp +++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -49,12 +50,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width])); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != + (block_shape * input->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != + (block_shape * input->tensor_shape()[idx_height])); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -63,8 +66,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } } // namespace -CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel() - : _input(nullptr), _output(nullptr), _block_shape() +CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel() : _input(nullptr), _output(nullptr), _block_shape() { _type = CLKernelType::ELEMENTWISE; } @@ -74,14 +76,18 @@ void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void 
CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); + TensorShape output_shape = + compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); @@ -98,7 +104,9 @@ void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(input->info()->dimension(idx_channel))); build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape)); build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width))); - _kernel = create_kernel(compile_context, "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel(compile_context, + "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*input->info(), Steps()); @@ -137,7 +145,6 @@ void CLDepthToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu enqueue(queue, *this, slice_in, lws_hint()); ++batch_id; - } - while(window.slide_window_slice_3D(slice_in)); + } while (window.slide_window_slice_3D(slice_in)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h index 1f7f77b569..cef70c4dda 100644 --- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h +++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -61,7 +62,8 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] block_shape Block shape value. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); + void + configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayerKernel. * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. 
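Every kernel touched in this patch follows the same three-step contract that the re-wrapped signatures make easier to scan: a static validate() that checks tensor infos only, a configure() that builds the OpenCL program and the execution window, and a run() that enqueues over that window. Below is a minimal sketch of driving CLDepthToSpaceLayerKernel directly, assuming an initialized CLScheduler and allocated tensors; run_depth_to_space and the block_shape value are illustrative, and in practice the corresponding runtime function drives the kernel:

#include <cstdint>

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"

void run_depth_to_space(arm_compute::ICLTensor *input, arm_compute::ICLTensor *output)
{
    using namespace arm_compute;

    constexpr int32_t block_shape = 2; // illustrative value

    // 1) Static check on the tensor infos; nothing is allocated yet.
    ARM_COMPUTE_ERROR_THROW_ON(CLDepthToSpaceLayerKernel::validate(input->info(), output->info(), block_shape));

    // 2) Build the OpenCL kernel (with the -D build options shown above) and
    //    compute the maximal execution window.
    CLDepthToSpaceLayerKernel kernel;
    kernel.configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);

    // 3) Enqueue over the configured window.
    kernel.run(kernel.window(), CLScheduler::get().queue());
}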
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp index e34b6929e7..b95abe795f 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp @@ -23,16 +23,17 @@ */ #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/CL/ICLKernel.h" @@ -45,12 +46,18 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_UNUSED(dwc_info); bool in_place = false; - if(output == nullptr || output == input) + if (output == nullptr || output == input) { in_place = true; output = input; @@ -58,11 +65,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1); ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_input_to_cl_image == true)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && (export_to_cl_image(weights) == false), "Weights cannot be exported to cl_image!"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && + (export_to_cl_image(weights) == false), + "Weights cannot be exported to cl_image!"); ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0)); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1); @@ -72,33 +82,40 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * conv_info.depth_multiplier)); // In place restrictions - if(in_place) + if 
(in_place) { - const int weights_width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const int weights_height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U || weights->tensor_shape()[weights_height_idx] != 1U); + const int weights_width_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const int weights_height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U || + weights->tensor_shape()[weights_height_idx] != 1U); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.depth_multiplier != 1U); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride() != std::make_pair(1U, 1U)); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size2D(1U, 1U)); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.has_padding()); // Note that in princple padding can be supported with in_place but we choose not to support it + ARM_COMPUTE_RETURN_ERROR_ON( + conv_info.pad_stride_info + .has_padding()); // Note that in princple padding can be supported with in_place but we choose not to support it } - const ConvolutionInfo info{ conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), conv_info.dilation }; - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info); + const ConvolutionInfo info{conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), + conv_info.dilation}; + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info); - if(conv_info.depth_multiplier > 1 && dwc_info.n0 > 1) + if (conv_info.depth_multiplier > 1 && dwc_info.n0 > 1) { ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % dwc_info.n0) != 0); } const bool is_quantized = is_data_type_quantized(input->data_type()); - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[idx_c]); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - if(is_quantized) + if (is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -108,7 +125,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } } - if(is_quantized) + if (is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); @@ -116,7 +133,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_multipliers->dimension(0)); @@ -134,22 +151,24 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); } - if(output->total_size() != 0) + if (output->total_size() != 0) { 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } - if(is_data_type_quantized(input->data_type())) + if (is_data_type_quantized(input->data_type())) { const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info; + const UniformQuantizationInfo oq_info = + (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info; float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); } return Status{}; @@ -171,30 +190,48 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel _type = CLKernelType::DEPTHWISE; } -void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) +void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers, + const ICLTensor *output_shifts) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info, + output_multipliers, output_shifts); } -void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) +void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers, + const ICLTensor *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - if(output == nullptr) + if (output == nullptr) { // In-place output = input; } - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), - dwc_info, conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), dwc_info, + conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, + (output_shifts != nullptr) ? 
output_shifts->info() : nullptr)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info); - auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info())); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape( + *(input->info()), *(weights->info()), conv_info); + auto_init_if_empty(*(output->info()), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_quantization_info(output->info()->quantization_info())); _input = input; _output = output; @@ -214,12 +251,12 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & CLBuildOptions build_opts; // Update the padding for the input/weights tensor if we can export to cl_image - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(input->info()); } - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weights->info()); } @@ -229,9 +266,10 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & const auto act_function = conv_info.act_info.activation(); const auto dst_data_type = _output->info()->data_type(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -268,23 +306,24 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1)); - build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1", "-DN0_A=" + support::cpp11::to_string(n0)); + build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1", + "-DN0_A=" + support::cpp11::to_string(n0)); build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_output->info()->dimension(0) % n0)); build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION"); // Force unroll with pragma when any of the following values exceed the maximum number of manual unroll - set_unroll_with_pragma(build_opts, { static_cast<int>(_weights->info()->dimension(1) + m0 - 1), - static_cast<int>(_weights->info()->dimension(1)), - static_cast<int>(_weights->info()->dimension(2)) - }); + set_unroll_with_pragma(build_opts, {static_cast<int>(_weights->info()->dimension(1) + 
m0 - 1), + static_cast<int>(_weights->info()->dimension(1)), + static_cast<int>(_weights->info()->dimension(2))}); - if(biases != nullptr) + if (biases != nullptr) { build_opts.add_option(std::string("-DHAS_BIAS")); - build_opts.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type()))); + build_opts.add_option( + std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type()))); } - if(_is_quantized) + if (_is_quantized) { kernel_name = "dwc_native_quantized_nhwc"; const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform(); @@ -306,13 +345,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); build_opts.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32)); build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32)); - build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" + get_cl_type_from_data_type(_output_multipliers->info()->data_type())); - build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" + get_cl_type_from_data_type(_output_shifts->info()->data_type())); - build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL, "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR"); + build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" + + get_cl_type_from_data_type(_output_multipliers->info()->data_type())); + build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" + + get_cl_type_from_data_type(_output_shifts->info()->data_type())); + build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL, + "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR"); // Note: We expect the input and output tensors to always adopt a per-tensor quantization approach int a_val{}; int b_val{}; - std::tie(b_val, a_val) = get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo); + std::tie(b_val, a_val) = + get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo); build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + support::cpp11::to_string(a_val)); build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + support::cpp11::to_string(b_val)); @@ -321,8 +364,10 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & { kernel_name = "dwc_native_fp_nhwc"; build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a())); - build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b())); + build_opts.add_option_if(conv_info.act_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a())); + build_opts.add_option_if(conv_info.act_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b())); } Window win = calculate_max_window(*(output->info()), Steps(n0, m0)); @@ -350,10 +395,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & _config_id += string_from_data_type(input->info()->data_type()); } -Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - 
const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts)); return Status{}; } @@ -370,47 +422,52 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm cl::Image2D input_cl_image; cl::Image2D weights_cl_image; - if(_export_input_to_cl_image || _export_weights_to_cl_image) + if (_export_input_to_cl_image || _export_weights_to_cl_image) { // Export cl_buffer to cl_image - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { - const size_t image_w = _input->info()->dimension(0) / 4; - const size_t image_h = _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3); + const size_t image_w = _input->info()->dimension(0) / 4; + const size_t image_h = + _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3); const TensorShape shape2d(image_w, image_h); const size_t image_row_pitch = _input->info()->strides_in_bytes()[1]; - input_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + input_cl_image = + create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, + _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { - const size_t image_w = _weights->info()->dimension(0) / 4; - const size_t image_h = _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3); + const size_t image_w = _weights->info()->dimension(0) / 4; + const size_t image_h = + _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3); const TensorShape shape2d(image_w, image_h); const size_t image_row_pitch = _weights->info()->strides_in_bytes()[1]; - weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch, - CLImage2DType::ReadOnly); + weights_cl_image = + create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, + _weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } } unsigned int idx = 0; - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { _kernel.setArg(idx++, input_cl_image); } add_4d_tensor_nhwc_argument(idx, _input); add_4d_tensor_nhwc_argument(idx, _output); - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { _kernel.setArg(idx++, weights_cl_image); } add_4d_tensor_nhwc_argument(idx, _weights); - if(_is_quantized) + if (_is_quantized) { add_1D_tensor_argument(idx, _output_multipliers, slice); add_1D_tensor_argument(idx, _output_shifts, slice); } - if(_biases != nullptr) + if (_biases != nullptr) { add_1D_tensor_argument(idx, 
_biases, slice); } diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h index 8eee7b2500..d34a662966 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h @@ -24,11 +24,11 @@ #ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H #define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/function_info/ConvolutionInfo.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { class ICLTensor; @@ -74,15 +74,28 @@ public: * * no padding * * no change of data layout after configure */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers = nullptr, + const ICLTensor *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel * * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure() */ - void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); + void configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ICLTensor *output_multipliers = nullptr, + const ICLTensor *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel * @@ -90,23 +103,29 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info, - const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const DWCComputeKernelInfo &dwc_info, + const ConvolutionInfo &conv_info, + const ITensorInfo *output_multipliers = nullptr, + const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; private: - const ICLTensor *_input {}; + const ICLTensor *_input{}; const ICLTensor *_weights{}; const ICLTensor *_biases{}; ICLTensor *_output{}; - unsigned int _depth_multiplier{ 0 }; + unsigned int _depth_multiplier{0}; const ICLTensor *_output_multipliers{}; const ICLTensor *_output_shifts{}; - bool _export_input_to_cl_image{ false }; - bool _export_weights_to_cl_image{ true }; - bool _is_quantized{ false }; + bool _export_input_to_cl_image{false}; + bool 
_export_weights_to_cl_image{true}; + bool _is_quantized{false}; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */ diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp index 9b514ed705..3d8f875ef7 100644 --- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp +++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -37,17 +38,20 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32); - ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -57,7 +61,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, + ITensorInfo *output, + ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_UNUSED(idx, config); @@ -69,21 +76,27 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -CLFFTDigitReverseKernel::CLFFTDigitReverseKernel() - : _input(nullptr), _output(nullptr), _idx(nullptr) +CLFFTDigitReverseKernel::CLFFTDigitReverseKernel() : _input(nullptr), _output(nullptr), _idx(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLFFTDigitReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config) +void CLFFTDigitReverseKernel::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *idx, + const FFTDigitReverseKernelInfo &config) { configure(CLKernelLibrary::get().get_compile_context(), input, output, idx, config); } -void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config) +void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, 
idx); - auto padding_info = get_padding_info({ input, output, idx }); + auto padding_info = get_padding_info({input, output, idx}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config)); _input = input; @@ -114,10 +127,14 @@ void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); return Status{}; } @@ -137,7 +154,6 @@ void CLFFTDigitReverseKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _output, slice); add_1D_tensor_argument(idx, _idx, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.h b/src/core/CL/kernels/CLFFTDigitReverseKernel.h index e5583a4c22..fdd1bcc3d3 100644 --- a/src/core/CL/kernels/CLFFTDigitReverseKernel.h +++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H #define ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { // Forward declarations @@ -56,7 +56,8 @@ public: * @param[in] idx Digit reverse index tensor. Data type supported: U32 * @param[in] config Kernel configuration. */ - void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config); + void + configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -65,7 +66,11 @@ public: * @param[in] idx Digit reverse index tensor. Data type supported: U32 * @param[in] config Kernel configuration. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *idx, + const FFTDigitReverseKernelInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFTDigitReverseKernel * * @param[in] input Source tensor info. Data types supported: F16/F32. 
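[Editor's note between hunks] The idx tensor validated here carries one precomputed digit-reversed position per element along the FFT axis; the kernel itself only performs the gather. As a rough illustration of what such an index table contains in the simplest, pure radix-2 case, here is a host-side sketch (the helper name is mine, and this is not how the library generates its indices):

#include <cstdint>
#include <vector>

// Build a bit-reversal permutation for a power-of-two length n (radix-2 only):
// element i of the digit-reversed output is read from position idx[i].
std::vector<uint32_t> make_bit_reversed_indices(uint32_t n)
{
    uint32_t log2n = 0;
    while ((1u << log2n) < n)
    {
        ++log2n;
    }
    std::vector<uint32_t> idx(n);
    for (uint32_t i = 0; i < n; ++i)
    {
        uint32_t reversed = 0;
        for (uint32_t b = 0; b < log2n; ++b)
        {
            reversed = (reversed << 1) | ((i >> b) & 1u);
        }
        idx[i] = reversed;
    }
    return idx;
}

For n = 8 this yields {0, 4, 2, 6, 1, 5, 3, 7}; mixed-radix transforms use the analogous digit (rather than bit) reversal.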
@@ -75,7 +80,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp index 95f4b640bd..3729e6b77d 100644 --- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp +++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -46,11 +47,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(CLFFTRadixStageKernel::supported_radix().count(config.radix) == 0); - ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] % config.radix); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -59,9 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) { - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output, *input); } @@ -76,8 +78,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -CLFFTRadixStageKernel::CLFFTRadixStageKernel() - : _input(nullptr), _output(nullptr), _run_in_place(false) +CLFFTRadixStageKernel::CLFFTRadixStageKernel() : _input(nullptr), _output(nullptr), _run_in_place(false) { _type = CLKernelType::ELEMENTWISE; } @@ -87,11 +88,15 @@ void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config) +void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTRadixStageKernelInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config)); - auto padding_info = get_padding_info({ input, output }); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), (output != nullptr) ? 
output->info() : nullptr, config)); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -110,11 +115,12 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set static arguments if not the first stage - if(!config.is_first_stage) + if (!config.is_first_stage) { const unsigned int Ni = config.Nx * config.radix; const float exp_const = (-2.0 * M_PI) / static_cast<float>(Ni); - unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters + unsigned int idx = + (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters _kernel.setArg<cl_uint>(idx++, config.Nx); _kernel.setArg<cl_uint>(idx++, Ni); _kernel.setArg<cl_float>(idx, exp_const); @@ -136,21 +142,22 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config) +Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const FFTRadixStageKernelInfo &config) { const bool run_in_place = (output == nullptr) || (output == input); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (run_in_place) ? nullptr : output->clone().get(), - config) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config) + .first); return Status{}; } std::set<unsigned int> CLFFTRadixStageKernel::supported_radix() { - return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 }; + return std::set<unsigned int>{2, 3, 4, 5, 7, 8}; } void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue) @@ -165,12 +172,11 @@ void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue) { unsigned int idx = 0; add_3D_tensor_argument(idx, _input, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, _output, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.h b/src/core/CL/kernels/CLFFTRadixStageKernel.h index 9bb310db83..de80bfced3 100644 --- a/src/core/CL/kernels/CLFFTRadixStageKernel.h +++ b/src/core/CL/kernels/CLFFTRadixStageKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H #define ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + #include <set> namespace arm_compute @@ -69,7 +69,10 @@ public: * @param[out] output Destination tensor. Can be nullptr. Data type supported: same as @p input * @param[in] config FFT descriptor metadata. 
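 *
 * (Editor's note, not part of the original header: a length-N FFT axis is executed as a chain of these
 * radix stages, so N must factor entirely into the supported radices {2, 3, 4, 5, 7, 8}; equivalently,
 * its prime factors must all lie in {2, 3, 5, 7}. A minimal host-side feasibility check, offered only
 * as a sketch:
 * @code
 * bool is_supported_fft_length(unsigned int n)
 * {
 *     for (unsigned int radix : {2u, 3u, 5u, 7u})
 *     {
 *         while (n > 1 && n % radix == 0)
 *         {
 *             n /= radix;
 *         }
 *     }
 *     return n == 1;
 * }
 * @endcode
 * For example, 480 = 8 * 4 * 3 * 5 is representable, while 22 = 2 * 11 is not.)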
*/ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTRadixStageKernelInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFTRadixStageKernel * * @param[in] input Source tensor info. Data types supported: F16/F32. diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp index 8a714d71bf..be6e16b074 100644 --- a/src/core/CL/kernels/CLFFTScaleKernel.cpp +++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -54,8 +55,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) } } // namespace -CLFFTScaleKernel::CLFFTScaleKernel() - : _input(nullptr), _output(nullptr), _run_in_place(false) +CLFFTScaleKernel::CLFFTScaleKernel() : _input(nullptr), _output(nullptr), _run_in_place(false) { _type = CLKernelType::ELEMENTWISE; } @@ -65,11 +65,14 @@ void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTS configure(CLKernelLibrary::get().get_compile_context(), input, output, config); } -void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config) +void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTScaleKernelInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -78,20 +81,22 @@ void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTen // Create kernel CLBuildOptions build_opts; build_opts.add_option_if(_run_in_place, "-DIN_PLACE"); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() : input->info()->num_channels())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() + : input->info()->num_channels())); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option_if(config.conjugate, "-DCONJ"); std::string kernel_name = "fft_scale_conj"; _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set static arguments - unsigned int idx = (1 + (_run_in_place ? 
0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters + unsigned int idx = + (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters _kernel.setArg<cl_float>(idx, config.scale); // Configure kernel window Window win = calculate_max_window(*input->info(), Steps()); - if(output != nullptr) + if (output != nullptr) { // Output auto initialization if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); @@ -130,12 +135,11 @@ void CLFFTScaleKernel::run(const Window &window, cl::CommandQueue &queue) { unsigned int idx = 0; add_3D_tensor_argument(idx, _input, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, _output, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFFTScaleKernel.h b/src/core/CL/kernels/CLFFTScaleKernel.h index cc518be193..b995282e02 100644 --- a/src/core/CL/kernels/CLFFTScaleKernel.h +++ b/src/core/CL/kernels/CLFFTScaleKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLFFTSCALEKERNEL_H #define ARM_COMPUTE_CLFFTSCALEKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { // Forward declarations @@ -63,7 +63,10 @@ public: * @param[out] output Destination tensor. Data type supported: same as @p input * @param[in] config Kernel configuration */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const FFTScaleKernelInfo &config); /** Static function to check if given info will lead to a valid configuration of @ref CLFFTScaleKernel * * @param[in] input Source tensor info. Data types supported: F16/F32. 
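[Editor's note, before the next file] CLFFTScaleKernel passes config.scale as its single static argument and, when compiled with -DCONJ, also conjugates each complex value; scaling by 1.0f / N is the usual final step of an inverse FFT. A CPU reference for that element-wise step, purely as an illustrative sketch with my own naming:

#include <complex>
#include <vector>

// Element-wise equivalent of the fft_scale_conj kernel: multiply every
// complex sample by 'scale' and conjugate it when 'conjugate' is set.
void fft_scale_conj_ref(std::vector<std::complex<float>> &data, float scale, bool conjugate)
{
    for (auto &v : data)
    {
        v *= scale;
        if (conjugate)
        {
            v = std::conj(v);
        }
    }
}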
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp index fcd99a4ed9..86bb502da3 100644 --- a/src/core/CL/kernels/CLFillBorderKernel.cpp +++ b/src/core/CL/kernels/CLFillBorderKernel.cpp @@ -31,14 +31,14 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute { -CLFillBorderKernel::CLFillBorderKernel() - : ICLKernel(), _tensor(nullptr) +CLFillBorderKernel::CLFillBorderKernel() : ICLKernel(), _tensor(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -56,27 +56,38 @@ void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue ICLKernel::add_argument<T>(idx, static_cast<T>(value)); } -void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void CLFillBorderKernel::configure(ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), tensor, border_size, border_mode, constant_border_value); } -void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void CLFillBorderKernel::configure(const CLCompileContext &compile_context, + ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { _tensor = tensor; configure(compile_context, tensor->info(), border_size, border_mode, constant_border_value); } -void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void CLFillBorderKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_ERROR_ON(tensor == nullptr); ARM_COMPUTE_ERROR_ON(tensor->num_channels() != 1); - auto padding_info = get_padding_info({ tensor }); + auto padding_info = get_padding_info({tensor}); border_size.limit(tensor->padding()); // If there is no border: early exit - if(border_size.empty() || border_mode == BorderMode::UNDEFINED) + if (border_size.empty() || border_mode == BorderMode::UNDEFINED) { return; } @@ -98,25 +109,22 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Create static kernel arguments - const unsigned int valid_width = tensor->valid_region().shape[0]; - const unsigned int valid_height = tensor->valid_region().shape[1]; - const cl_int2 valid_region_coords = - { - { - static_cast<cl_int>(tensor->valid_region().anchor[0]), - static_cast<cl_int>(tensor->valid_region().anchor[1]), - } - }; - const unsigned int total_valid_width = border_size.left + valid_width + border_size.right; + const unsigned int valid_width = tensor->valid_region().shape[0]; + const unsigned int valid_height = tensor->valid_region().shape[1]; + const cl_int2 valid_region_coords = {{ + static_cast<cl_int>(tensor->valid_region().anchor[0]), + static_cast<cl_int>(tensor->valid_region().anchor[1]), + }}; + const unsigned int total_valid_width = border_size.left + valid_width + 
border_size.right; // Set static kernel arguments unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters ICLKernel::add_argument<cl_uint>(idx, valid_width); ICLKernel::add_argument<cl_uint>(idx, valid_height); ICLKernel::add_argument<cl_int2>(idx, valid_region_coords); - if(BorderMode::CONSTANT == border_mode) + if (BorderMode::CONSTANT == border_mode) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -175,12 +183,13 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { // Border mode undefined or border width == 0 - if(_kernel() == nullptr) + if (_kernel() == nullptr) { return; } - const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto tensor = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); @@ -193,14 +202,13 @@ void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl:: unsigned int idx = 0; add_3D_tensor_argument(idx, tensor, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue) { // Border mode undefined or border width == 0 - if(_kernel() == nullptr) + if (_kernel() == nullptr) { return; } @@ -216,7 +224,6 @@ void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue) unsigned int idx = 0; add_3D_tensor_argument(idx, _tensor, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLFillBorderKernel.h b/src/core/CL/kernels/CLFillBorderKernel.h index 7951f48171..5782143cf9 100644 --- a/src/core/CL/kernels/CLFillBorderKernel.h +++ b/src/core/CL/kernels/CLFillBorderKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -57,7 +58,11 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(const CLCompileContext &compile_context, + ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Initialise the kernel's input, output and border mode. * * @param[in,out] tensor Tensor to process Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32. @@ -65,7 +70,10 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
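 *
 * (Editor's note: an illustrative call, where tensor stands for any already-allocated ICLTensor and is
 * not defined in this header.)
 * @code
 * CLFillBorderKernel fill_border;
 * fill_border.configure(&tensor, BorderSize(1), BorderMode::REPLICATE);
 * @endcode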
*/ - void configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(ICLTensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Initialise the kernel's input, output and border mode. * * @param[in] compile_context The compile context to be used. @@ -74,7 +82,11 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Function to set the constant value on fill border kernel depending on type. * diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp index 68fe324df6..7da0679ae4 100644 --- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp +++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp @@ -30,20 +30,26 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status validate_arguments(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { ARM_COMPUTE_UNUSED(epsilon); ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); @@ -54,43 +60,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr); ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1); - if(fbn_type == FuseBatchNormalizationType::CONVOLUTION) + if (fbn_type == FuseBatchNormalizationType::CONVOLUTION) { ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0)); } else { - const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); + const size_t channel_idx = + get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0)); } // Validate bias - if(input_bias != nullptr) + if (input_bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias); } // Validate beta - if(bn_beta != nullptr) + if (bn_beta != nullptr) 
{ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta); } // Validate gamma - if(bn_gamma != nullptr) + if (bn_gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma); } // Validate output weights - if(fused_weights != nullptr && fused_weights->total_size() != 0) + if (fused_weights != nullptr && fused_weights->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights); } // Validate output bias - if(fused_bias != nullptr && fused_bias->total_size() != 0) + if (fused_bias != nullptr && fused_bias->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias); @@ -101,28 +108,52 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b } // namespace CLFuseBatchNormalizationKernel::CLFuseBatchNormalizationKernel() - : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(), - _run_in_place_weights(false), _run_in_place_bias(false) + : _input_weights(nullptr), + _input_bias(nullptr), + _bn_mean(nullptr), + _bn_var(nullptr), + _bn_gamma(nullptr), + _bn_beta(nullptr), + _fused_weights(nullptr), + _fused_bias(nullptr), + _epsilon(), + _run_in_place_weights(false), + _run_in_place_bias(false) { _type = CLKernelType::ELEMENTWISE; } -void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } -void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { 
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); - auto padding_info = get_padding_info({ input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma }); + auto padding_info = + get_padding_info({input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma}); _input_weights = input_weights; _input_bias = input_bias; @@ -135,28 +166,28 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c _epsilon = epsilon; _run_in_place_weights = (fused_weights == nullptr) || (fused_weights == input_weights); - _run_in_place_bias = (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias); + _run_in_place_bias = + (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias); // Auto initialize outputs - if(_fused_weights != nullptr) + if (_fused_weights != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone()); } - if(_fused_bias != nullptr) + if (_fused_bias != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone()); } // Validate arguments - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(), - (fused_weights != nullptr) ? fused_weights->info() : nullptr, - (fused_bias != nullptr) ? fused_bias->info() : nullptr, - (input_bias != nullptr) ? input_bias->info() : nullptr, - (bn_beta != nullptr) ? bn_beta->info() : nullptr, - (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, - epsilon, fbn_type)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input_weights->info(), bn_mean->info(), bn_var->info(), + (fused_weights != nullptr) ? fused_weights->info() : nullptr, + (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr, + (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? 
bn_gamma->info() : nullptr, epsilon, + fbn_type)); // Configure kernel window Window win = calculate_max_window(*input_weights->info()); @@ -165,7 +196,8 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c // Set build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input_weights->info()->data_type())); - build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION, "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2))); + build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION, + "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2))); build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon)); build_opts.add_option_if(_input_weights->info()->data_layout() == DataLayout::NHWC, "-DNHWC"); build_opts.add_option_if(_run_in_place_weights, "-DIN_PLACE_W"); @@ -180,12 +212,19 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); return Status{}; } @@ -202,25 +241,25 @@ void CLFuseBatchNormalizationKernel::run(const arm_compute::Window &window, cl:: // Add kernel arguments unsigned int idx = 0; add_3D_tensor_argument(idx, _input_weights, slice_3d); - if(_input_bias != nullptr) + if (_input_bias != nullptr) { add_1D_tensor_argument(idx, _input_bias, slice_1d); } add_1D_tensor_argument(idx, _bn_mean, slice_1d); add_1D_tensor_argument(idx, _bn_var, slice_1d); - if(!_run_in_place_weights) + if (!_run_in_place_weights) { add_3D_tensor_argument(idx, _fused_weights, slice_3d); } - if(!_run_in_place_bias) + if (!_run_in_place_bias) { add_1D_tensor_argument(idx, _fused_bias, slice_1d); } - if(_bn_beta != nullptr) + if (_bn_beta != nullptr) { add_1D_tensor_argument(idx, _bn_beta, slice_1d); } - if(_bn_gamma != nullptr) + if (_bn_gamma != nullptr) { add_1D_tensor_argument(idx, _bn_gamma, slice_1d); } diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h index 78b1e74cab..76ec7a759f 100644 --- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h +++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h @@ -62,9 +62,16 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION. 
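 *
 * (Editor's note, standard batch-normalization folding rather than text from the original header: per
 * output channel the kernel conceptually computes
 * @code
 * fused_weights = input_weights * bn_gamma / sqrt(bn_var + epsilon)
 * fused_bias    = (input_bias - bn_mean) * bn_gamma / sqrt(bn_var + epsilon) + bn_beta
 * @endcode
 * with bn_gamma and bn_beta treated as 1 and 0 when not provided, so that convolution followed by
 * batch normalization collapses into a single convolution.)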
*/ - void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias = nullptr, + const ICLTensor *bn_beta = nullptr, + const ICLTensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Set the source, destination of the kernel * * @param[in] compile_context The compile context to be used. @@ -81,9 +88,17 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias = nullptr, + const ICLTensor *bn_beta = nullptr, + const ICLTensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalizationKernel * * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC @@ -101,10 +116,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + static Status validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias = nullptr, + const ITensorInfo *bn_beta = nullptr, + const ITensorInfo *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp index 5495023b80..c11a18940a 100644 --- a/src/core/CL/kernels/CLGatherKernel.cpp +++ b/src/core/CL/kernels/CLGatherKernel.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "src/core/CL/kernels/CLGatherKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -34,7 +36,8 @@ namespace arm_compute { namespace { -inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) +inline Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); @@ -43,11 +46,12 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->tensor_shape(), indices->tensor_shape(), actual_axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } @@ -56,12 +60,14 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); // Output auto initialization if not yet initialized - TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->tensor_shape(), indices->tensor_shape(), actual_axis); auto_init_if_empty((*output), output_shape, 1, input->data_type()); // Create window @@ -72,8 +78,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } // namespace -CLGatherKernel::CLGatherKernel() - : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) +CLGatherKernel::CLGatherKernel() : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) { _type = CLKernelType::ELEMENTWISE; } @@ -83,10 +88,14 @@ void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices, configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis); } -void CLGatherKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) +void CLGatherKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); - auto padding_info = get_padding_info({ input, output, indices }); + auto padding_info = 
get_padding_info({input, output, indices}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), indices->info(), output->info(), axis)); // Configure kernel window @@ -100,7 +109,8 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC // Set build options CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DOUTPUT_DIM_Z=" + support::cpp11::to_string(output->info()->dimension(2))); build_opts.add_option("-DINDICES_DIM_Z=" + support::cpp11::to_string(indices->info()->dimension(2))); build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); @@ -114,10 +124,12 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) +Status +CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first); return Status{}; } diff --git a/src/core/CL/kernels/CLGatherKernel.h b/src/core/CL/kernels/CLGatherKernel.h index 8f472a4696..db4b49d2f5 100644 --- a/src/core/CL/kernels/CLGatherKernel.h +++ b/src/core/CL/kernels/CLGatherKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLGATHERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -63,7 +64,11 @@ public: * @param[out] output Destination tensor. Data type supported: Same as @p input * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. 
Defaults to 0 */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis = 0); /** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel * @@ -74,7 +79,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0); + static Status + validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp index 088c454f3c..b9ff72b928 100644 --- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp +++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -47,7 +48,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2); - if(all_anchors->total_size() > 0) + if (all_anchors->total_size() > 0) { size_t feature_height = info.feat_height(); size_t feature_width = info.feat_width(); @@ -57,7 +58,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors); - if(is_data_type_quantized(anchors->data_type())) + if (is_data_type_quantized(anchors->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors); } @@ -66,21 +67,25 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc } } // namespace -CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel() - : _anchors(nullptr), _all_anchors(nullptr) +CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel() : _anchors(nullptr), _all_anchors(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info) +void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, + ICLTensor *all_anchors, + const ComputeAnchorsInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info); } -void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info) +void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *anchors, + ICLTensor *all_anchors, + const ComputeAnchorsInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(anchors, all_anchors); - auto padding_info = get_padding_info({ anchors, all_anchors }); + auto padding_info = get_padding_info({anchors, 
all_anchors}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(anchors->info(), all_anchors->info(), info)); // Metadata @@ -91,7 +96,8 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex // Initialize the output if empty const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors); - auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info())); + auto_init_if_empty(*all_anchors->info(), + TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info())); // Set instance variables _anchors = anchors; @@ -108,7 +114,7 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DNUM_ANCHORS=" + support::cpp11::to_string(num_anchors)); build_opts.add_option("-DNUM_ROI_FIELDS=" + support::cpp11::to_string(info.values_per_roi())); - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform(); build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); @@ -116,8 +122,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex } // Create kernel - const std::string kernel_name = (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + const std::string kernel_name = + (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // The tensor all_anchors can be interpreted as an array of structs (each struct has values_per_roi fields). 
// This means we don't need to pad on the X dimension, as we know in advance how many fields @@ -127,7 +134,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info) +Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, + const ITensorInfo *all_anchors, + const ComputeAnchorsInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info)); return Status{}; diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h index d26795ac7d..e08f281d6c 100644 --- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h +++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h @@ -62,7 +62,10 @@ public: * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *anchors, + ICLTensor *all_anchors, + const ComputeAnchorsInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel * @@ -81,5 +84,5 @@ private: const ICLTensor *_anchors; ICLTensor *_all_anchors; }; -} // arm_compute +} // namespace arm_compute #endif // ARM_COMPUTE_CLGENERATEPROSPOSALSLAYERKERNEL_H diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp index 7ed323c950..b13eb16556 100644 --- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -39,17 +40,20 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.epsilon == 0.f, "Epsilon must be different than 0"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); } return Status{}; @@ -59,27 +63,30 @@ Status validate_arguments_meanvar(const ITensorInfo *input, const ITensorInfo *o { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); - if(output != nullptr && output->total_size() != 0) + if (output != nullptr 
&& output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); } return Status{}; } } // namespace -CLComputeMeanVariance::CLComputeMeanVariance() - : _input(nullptr), _output(nullptr) +CLComputeMeanVariance::CLComputeMeanVariance() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision) +void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + bool use_mixed_precision) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output == nullptr ? input : output; @@ -88,7 +95,8 @@ void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, I const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); CLBuildOptions build_opts; - build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.add_option("-DINTERNAL_DATA_TYPE=" + + (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type()))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0))); @@ -108,7 +116,7 @@ void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, I const TensorShape out_shape(input_channel, 2u, input_batches); // Output auto initialization if not yet initialized - if(use_mixed_precision) + if (use_mixed_precision) { auto_init_if_empty(*_output->info(), out_shape, 1, DataType::F32); } @@ -134,7 +142,7 @@ void CLComputeMeanVariance::run(const Window &window, cl::CommandQueue &queue) Window collapsed_window = window.collapse(window, Window::DimZ); // We will process the planes together - if(_input->info()->data_layout() == DataLayout::NCHW) + if (_input->info()->data_layout() == DataLayout::NCHW) { collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -157,10 +165,14 @@ CLInstanceNormalizationLayerKernel::CLInstanceNormalizationLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info) +void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *mean_var, + ICLTensor *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output == nullptr ? 
input : output; @@ -172,7 +184,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision + ? "float" + : get_cl_type_from_data_type(input->info()->data_type()))); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0))); build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1))); @@ -188,7 +202,7 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi // Configure kernel window Window win = calculate_max_window(*input->info(), Steps(1)); - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type()); } @@ -197,7 +211,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info) +Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info)); return Status{}; @@ -211,7 +227,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu Window collapsed_window = window.collapse(window, Window::DimZ); // We will process the planes together - if(_input->info()->data_layout() == DataLayout::NCHW) + if (_input->info()->data_layout() == DataLayout::NCHW) { collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -226,7 +242,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu add_4D_tensor_argument(idx, _input, collapsed_window); add_3D_tensor_argument(idx, _mean, collapsed_window); - if(!_run_in_place) + if (!_run_in_place) { add_4D_tensor_argument(idx, _output, collapsed_window); } diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h index 2f9014a651..9f436da7f6 100644 --- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h +++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H #define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H -#include "src/core/CL/ICLKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { // Forward declarations @@ -59,7 +59,11 @@ public: * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. 
* @param[in] info Kernel meta-data descriptor */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *mean_var, + ICLTensor *output, + const InstanceNormalizationLayerKernelInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer. * @@ -69,7 +73,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; @@ -106,7 +111,8 @@ public: * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. * @param[in] use_mixed_precision Use mixed precision in case of FP16 execution */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision); + void + configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision); /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer. * diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp index 542d380e4a..9ed9d7c5b0 100644 --- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp +++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp @@ -31,10 +31,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute @@ -43,7 +43,8 @@ namespace { constexpr int max_input_tensor_dim = 3; -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_UNUSED(epsilon); @@ -53,14 +54,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, + "Actual normalization axis greater than max number of dimensions"); // Reduce shape on axis TensorShape sum_shape = input->tensor_shape(); sum_shape.set(actual_axis, 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -78,16 +80,22 @@ CLL2NormalizeLayerKernel::CLL2NormalizeLayerKernel() 
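/* For context on the kernel being configured below: L2 normalisation consumes a
   precomputed sum-of-squares tensor and computes out = x / sqrt(max(sum_sq, epsilon))
   along the selected axis. A scalar sketch of one row (illustrative; epsilon is the
   same lower bound passed as a kernel argument for the F32 path further down):

   #include <algorithm>
   #include <cmath>

   void l2_normalize_row(const float *x, float sum_sq, float epsilon, float *out, int n)
   {
       // sum_sq is the reduction of x[i]^2 over this row, produced by a prior kernel
       const float inv_norm = 1.f / std::sqrt(std::max(sum_sq, epsilon));
       for (int i = 0; i < n; ++i)
       {
           out[i] = x[i] * inv_norm;
       }
   }
*/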
_type = CLKernelType::ELEMENTWISE; } -void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon) +void CLL2NormalizeLayerKernel::configure( + const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon) { configure(CLKernelLibrary::get().get_compile_context(), input, sum, output, axis, epsilon); } -void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon) +void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *sum, + ICLTensor *output, + int axis, + float epsilon) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon)); - auto padding_info = get_padding_info({ input, sum, output }); + auto padding_info = get_padding_info({input, sum, output}); _input = input; _sum = sum; @@ -95,8 +103,9 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context _actual_axis = wrap_around(axis, max_input_tensor_dim); _epsilon = epsilon; - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); - const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; + const unsigned int vec_size_x = + adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); + const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; // Set build options CLBuildOptions build_opts; @@ -107,7 +116,7 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context // Create kernel std::string kernel_name; unsigned int idx = 0; - switch(_actual_axis) + switch (_actual_axis) { case 0: kernel_name = "l2_normalize_x"; @@ -127,7 +136,7 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set epsilon argument - if(input->info()->data_type() == DataType::F32) + if (input->info()->data_type() == DataType::F32) { _kernel.setArg<cl_float>(idx, _epsilon); } @@ -146,7 +155,8 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status CLL2NormalizeLayerKernel::validate( + const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon)); return Status{}; @@ -159,7 +169,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue Window window_sum(window); - switch(_actual_axis) + switch (_actual_axis) { case 0: { @@ -173,8 +183,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue add_2D_tensor_argument(idx, _sum, sum_slice); add_2D_tensor_argument(idx, _output, in_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); + } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); } break; case 1: @@ -189,8 +198,7 @@ void 
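/* The vec_size_x / vec_size_x_leftovers pair computed above drives vectorisation:
   each work-item handles vec_size_x elements, and the leftover count selects the
   partial-store path of the last iteration. A simplified sketch of the arithmetic,
   assuming max_cl_vector_width is 16 bytes and adjust_vec_size() merely caps the
   requested width at dimension 0 (the real helper also snaps to supported OpenCL
   vector sizes):

   #include <algorithm>

   unsigned int adjust_vec_size_sketch(unsigned int requested, unsigned int dim0)
   {
       return std::min(requested, dim0); // simplification of the ACL helper
   }

   // Example for an F32 tensor with dimension(0) == 30:
   //   requested  = 16 / sizeof(float) = 4
   //   vec_size_x = adjust_vec_size_sketch(4, 30) = 4
   //   leftovers  = 30 % 4 = 2  -> the final store writes only 2 lanes
*/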
CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue add_2D_tensor_argument(idx, _sum, sum_slice); add_2D_tensor_argument(idx, _output, in_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); + } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); } break; case 2: @@ -205,8 +213,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue add_3D_tensor_argument(idx, _sum, sum_slice); add_3D_tensor_argument(idx, _output, in_slice); enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice)); + } while (window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice)); } break; default: diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h index edc0585217..5c9ab94ce5 100644 --- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h +++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -70,7 +71,12 @@ public: * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2 * @param[in] epsilon Lower bound value for the normalization. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *sum, + ICLTensor *output, + int axis, + float epsilon); /** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayerKernel. 
* @@ -84,7 +90,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); + static Status + validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp index dc9d68626d..e560f1de4a 100644 --- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,26 +43,31 @@ using namespace misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, indices); - int pool_stride_x = 0; - int pool_stride_y = 0; - PoolingType pool_type = pool_info.pool_type; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + int pool_stride_x = 0; + int pool_stride_y = 0; + PoolingType pool_type = pool_info.pool_type; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_size_x = pool_info.pool_size.width; - const int pool_size_y = pool_info.pool_size.height; + const int pool_size_x = pool_info.pool_size.width; + const int pool_size_y = pool_info.pool_size.height; const Size2D pool_size(pool_size_x, pool_size_y); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, + "Pooling indices only supported for MAX pooling method"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -71,17 +77,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel() - : _input(nullptr), _output(nullptr), _indices(nullptr) +CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel() : _input(nullptr), _output(nullptr), _indices(nullptr) { _type = CLKernelType::POOL; } -void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, 
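/* Max unpooling inverts a 2x2 max pool: every pooled value is scattered into a
   zero-initialised output at the flat position recorded by the U32 indices tensor.
   A minimal host-side sketch of that scatter (illustrative; the kernel configured
   here performs the same operation per 3D slice on the GPU):

   #include <cstdint>
   #include <vector>

   std::vector<float> max_unpool(const std::vector<float>    &pooled,
                                 const std::vector<uint32_t> &indices, // recorded by the pooling forward pass
                                 size_t                       output_size)
   {
       std::vector<float> out(output_size, 0.f); // non-maximum positions stay zero
       for (size_t i = 0; i < pooled.size(); ++i)
       {
           out[indices[i]] = pooled[i];
       }
       return out;
   }
*/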
const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) +void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, indices->info())); - auto padding_info = get_padding_info({ input, indices, output }); + auto padding_info = get_padding_info({input, indices, output}); _input = input; _output = output; @@ -119,7 +128,10 @@ void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices)); @@ -140,7 +152,6 @@ void CLMaxUnpoolingLayerKernel::run(const Window &window, cl::CommandQueue &queu add_3D_tensor_argument(idx, _output, slice); add_3D_tensor_argument(idx, _indices, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h index 45481d0507..eb18a46784 100644 --- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h +++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h @@ -59,7 +59,11 @@ public: * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayerKernel * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
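/* The static validate() overloads being rewrapped here all follow one contract:
   they run the configure-time checks against ITensorInfo only, so a caller can
   reject an invalid setup before allocating tensors or creating kernels. A hedged
   usage sketch (the shapes and the PoolingLayerInfo constructor arguments are
   assumptions for illustration):

   TensorInfo src(TensorShape(8U, 8U, 1U), 1, DataType::F32);
   TensorInfo indices(TensorShape(8U, 8U, 1U), 1, DataType::U32);
   TensorInfo dst; // left empty: an unconfigured output is accepted
   PoolingLayerInfo pool_info(PoolingType::MAX, Size2D(2, 2), DataLayout::NHWC, PadStrideInfo());
   const Status status = CLMaxUnpoolingLayerKernel::validate(&src, &indices, &dst, pool_info);
   // status.error_code() == ErrorCode::OK when the configuration is valid
*/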
@@ -72,7 +76,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info); // Inherited methods overridden void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp index ac33468ad8..8632bdf623 100644 --- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp +++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -49,7 +50,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -69,15 +70,19 @@ void CLMeanStdDevNormalizationKernel::configure(ICLTensor *input, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon); } -void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon) +void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float epsilon) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); _run_in_place = (output == nullptr) || (output == input); - ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon)); + ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate( + input->info(), (output != nullptr) ? 
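/* The kernel validated here normalises each 2D row to zero mean and unit variance:
   out = (x - mean(x)) / sqrt(var(x) + epsilon), with epsilon guarding zero-variance
   rows. A scalar reference for one row, run in place to match the output == nullptr
   path (illustrative):

   #include <cmath>
   #include <cstddef>

   void mean_stddev_normalize_row(float *row, size_t n, float epsilon)
   {
       float sum = 0.f, sum_sq = 0.f;
       for (size_t i = 0; i < n; ++i)
       {
           sum    += row[i];
           sum_sq += row[i] * row[i];
       }
       const float mean       = sum / static_cast<float>(n);
       const float var        = sum_sq / static_cast<float>(n) - mean * mean;
       const float inv_stddev = 1.f / std::sqrt(var + epsilon);
       for (size_t i = 0; i < n; ++i)
       {
           row[i] = (row[i] - mean) * inv_stddev;
       }
   }
*/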
output->info() : nullptr, epsilon)); - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output->info(), *input->info()); } @@ -85,7 +90,8 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_ _input = input; _output = output; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); // Set build options CLBuildOptions build_opts; @@ -134,7 +140,6 @@ void CLMeanStdDevNormalizationKernel::run(const Window &window, cl::CommandQueue add_2D_tensor_argument_if((!_run_in_place), idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h index a1ba2b905e..e02a3c58a3 100644 --- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h +++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h @@ -66,7 +66,10 @@ public: * @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8. */ - void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f); + void configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output = nullptr, + float epsilon = 1e-8f); /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel * * @param[in] input Source tensor info with 2 dimensions. 
In case of @p output tensor info = nullptr, diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp index c6c4229c00..b636c485e7 100644 --- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp @@ -32,6 +32,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Window.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -53,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -63,7 +64,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output, *input->clone()); @@ -71,9 +73,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen bool window_changed = false; Window win; const DataLayout data_layout = input->data_layout(); - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); + const unsigned int vec_size_x = + adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info); const bool is_norm_across_width = norm_idx == 0; @@ -87,15 +90,16 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen // The output has 1 right padding because of the vec_size_x. // The input has 1 left padding because radius = 1. // The input has 2 right padding because of radius = 1 AND because of the extra output padding - const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0; - const unsigned int border_width_right = is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0; - const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left); + const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0; + const unsigned int border_width_right = + is_norm_across_width ? 
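/* Worked numbers for the border computation split across this expression, assuming
   an in-map normalisation across the width (example values, not from a test):

   // F32 input, dimension(0) = 30, max_cl_vector_width = 16 bytes:
   //   vec_size_x         = 16 / 4 = 4
   //   norm_size = 3  ->  norm_radius = 1
   //   border_width_left  = norm_radius              = 1
   //   border_width_right = 1 + (4 - 30 % 4) = 1 + 2 = 3
   //   BorderSize(top = 0, right = 3, bottom = 0, left = 1)
   // The extra right border keeps the vectorised reads of the last columns inside
   // allocated padding.
*/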
norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0; + const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left); win = calculate_max_window(*input, Steps(vec_size_x)); // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding // Reads can occur within the valid region of the input - if(is_norm_across_width) + if (is_norm_across_width) { AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0); window_changed = window_changed || update_window_and_padding(win, input_access); @@ -112,13 +116,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen else { unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); - if(norm_info.is_cross_map()) + if (norm_info.is_cross_map()) { vec_size_x = 1; } win = calculate_max_window(*input, Steps(vec_size_x)); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -139,10 +144,13 @@ void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info); } -void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info) +void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + NormalizationLayerInfo norm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info)); @@ -152,16 +160,17 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte _input = input; _output = output; - const DataLayout data_layout = input->info()->data_layout(); - unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); - int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; - if(norm_info.is_cross_map() && data_layout == DataLayout::NHWC) + const DataLayout data_layout = input->info()->data_layout(); + unsigned int vec_size_x = + adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); + int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; + if (norm_info.is_cross_map() && data_layout == DataLayout::NHWC) { vec_size_x = 1; vec_size_x_leftovers = 0; } - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { const unsigned int norm_idx = get_normalization_dimension_index(data_layout, norm_info); _is_norm_across_width = norm_idx == 0; @@ -175,9 +184,10 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte // The output has 1 right padding because of the vec_size_x. // The input has 1 left padding because radius = 1. // The input has 2 right padding because of radius = 1 AND the extra output padding - const unsigned int border_width_left = _is_norm_across_width ? 
norm_radius : 0; - const unsigned int border_width_right = _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0; - _border_size = BorderSize(0, border_width_right, 0, border_width_left); + const unsigned int border_width_left = _is_norm_across_width ? norm_radius : 0; + const unsigned int border_width_right = + _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0; + _border_size = BorderSize(0, border_width_right, 0, border_width_left); } const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D); @@ -193,12 +203,14 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2))); build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2)))); build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D"); - build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0))); - build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC, "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), + "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0))); + build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC, + "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1))); // Create kernel std::string kernel_name; - if(norm_info.is_in_map()) + if (norm_info.is_in_map()) { kernel_name = "normalization_layer_in_map_" + lower_string(string_from_data_layout(data_layout)); } @@ -222,16 +234,19 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte _config_id += support::cpp11::to_string(input->info()->dimension(0)); _config_id += "_"; _config_id += support::cpp11::to_string(input->info()->dimension(1)); - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } } -Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info) +Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + NormalizationLayerInfo norm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first); return Status{}; } @@ -251,7 +266,6 @@ void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &que add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.h b/src/core/CL/kernels/CLNormalizationLayerKernel.h index 739a2ae9f1..5517ba6904 100644 --- a/src/core/CL/kernels/CLNormalizationLayerKernel.h +++ b/src/core/CL/kernels/CLNormalizationLayerKernel.h @@ -63,7 +63,10 @@ public: * Data layouts supported: same as @p input. * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + NormalizationLayerInfo norm_info); /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], @@ -77,7 +80,7 @@ public: static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info); // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; + void run(const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; private: diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp index 6b0400d50e..59352a8fb7 100644 --- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp +++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp @@ -31,32 +31,35 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std); ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors"); - const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int channel_idx = + get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0)); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -77,7 +80,8 @@ std::pair<Status, Window> 
validate_and_configure_window_nchw(ITensorInfo *input, bool window_changed = update_window_and_padding(win, input_access, output_access); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -88,12 +92,19 @@ CLNormalizePlanarYUVLayerKernel::CLNormalizePlanarYUVLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std); } -void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std); @@ -102,7 +113,7 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; @@ -112,9 +123,10 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ const DataLayout data_layout = input->info()->data_layout(); // Get number of elements to process per iterations - const unsigned int num_elems_processed_per_iteration = (data_layout == DataLayout::NHWC) ? adjust_vec_size(16 / input->info()->element_size(), - input->info()->dimension(0)) : - (16 / input->info()->element_size()); + const unsigned int num_elems_processed_per_iteration = + (data_layout == DataLayout::NHWC) + ? 
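/* NormalizePlanarYUV, validated and configured below, applies a per-channel affine
   normalisation: out[..., c] = (in[..., c] - mean[c]) / std[c], with mean and std
   given as 1D vectors holding one value per input channel. A host-side sketch for
   a single NHWC pixel (illustrative):

   #include <cstddef>

   void normalize_planar_yuv_pixel(float *px, const float *mean, const float *std_dev, size_t channels)
   {
       for (size_t c = 0; c < channels; ++c)
       {
           px[c] = (px[c] - mean[c]) / std_dev[c];
       }
   }
*/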
adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)) + : (16 / input->info()->element_size()); const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); const DataType dt = input->info()->data_type(); @@ -122,11 +134,12 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ CLBuildOptions build_opts; build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt))); build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration))); + build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration))); build_opts.add_option(("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(channel_idx)))); std::string kernel_name = "normalize_planar_yuv_layer_"; - if(is_data_type_quantized(dt)) + if (is_data_type_quantized(dt)) { const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform(); build_opts.add_option(("-DOFFSET=" + support::cpp11::to_string(qinfo.offset))); @@ -139,7 +152,7 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); ICLKernel::configure_internal(win); @@ -165,12 +178,16 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_ _config_id += support::cpp11::to_string(input->info()->dimension(2)); } -Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std) +Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *std) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std)); - if(input->data_layout() == DataLayout::NCHW) + if (input->data_layout() == DataLayout::NCHW) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first); } return Status{}; } @@ -196,7 +213,6 @@ void CLNormalizePlanarYUVLayerKernel::run(const Window &window, cl::CommandQueue add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h index 6db4433e78..341b404e3d 100644 --- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h +++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h @@ -67,7 +67,11 @@ public: * @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels. 
* Data types supported: same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std); /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayerKernel * * @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels]. @@ -79,7 +83,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp index 53f313c0d3..0ac285038e 100644 --- a/src/core/CL/kernels/CLPadLayerKernel.cpp +++ b/src/core/CL/kernels/CLPadLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -35,25 +36,29 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_UNUSED(constant_value); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON((padding.size() < 1) || (padding.size() > input->num_dimensions())); - if(mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC) + if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC) { ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3); const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT); - for(size_t i = 0; i < padding.size(); ++i) + for (size_t i = 0; i < padding.size(); ++i) { ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect)); ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect)); } } - if(output->total_size() > 0) + if (output->total_size() > 0) { TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); @@ -65,41 +70,51 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLPadLayerKernel::CLPadLayerKernel() - : _input(nullptr), _output(nullptr), _4d_enabled(false) +CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _4d_enabled(false) { _type = CLKernelType::ELEMENTWISE; } -void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayerKernel::configure( + const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) { 
configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode); } -void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding))); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, constant_value, mode)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; _4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3); // Set build options - const DataType &data_type = input->info()->data_type(); - const unsigned int input_width = input->info()->dimension(0); - const unsigned int input_height = input->info()->dimension(1); - const unsigned int input_depth = input->info()->dimension(2); - const unsigned int pad_x_before = padding.at(0).first; - const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0; - const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0; - const unsigned int vec_size = adjust_vec_size(std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))), input_width); - const unsigned int pad_right_start = input_width + pad_x_before; - const unsigned int pad_x_before_remainder = pad_x_before % vec_size; - const unsigned int vec_size_leftover_write = vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0)); + const DataType &data_type = input->info()->data_type(); + const unsigned int input_width = input->info()->dimension(0); + const unsigned int input_height = input->info()->dimension(1); + const unsigned int input_depth = input->info()->dimension(2); + const unsigned int pad_x_before = padding.at(0).first; + const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0; + const unsigned int pad_z_before = padding.size() > 2 ? 
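/* The vector-size bookkeeping resumed below decides how many lanes the boundary
   stores of each padded row commit. Worked numbers, assuming ceil_to_multiple
   rounds up to the next multiple (example values):

   // F32 input, input_width = 10, pad x before/after = 3/2 -> output width = 15
   //   vec_size                = adjust(min(16, 32 / 4), 10) = 8
   //   pad_right_start         = 10 + 3                      = 13
   //   pad_x_before_remainder  = 3 % 8                       = 3
   //   vec_size_leftover_write = 8 - (ceil_to_multiple(15, 8) - 15)
   //                           = 8 - (16 - 15)               = 7
   // so the last vector store of each row writes 7 of its 8 lanes.
*/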
padding.at(2).first : 0; + const unsigned int vec_size = adjust_vec_size( + std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))), + input_width); + const unsigned int pad_right_start = input_width + pad_x_before; + const unsigned int pad_x_before_remainder = pad_x_before % vec_size; + const unsigned int vec_size_leftover_write = + vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0)); CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); @@ -108,12 +123,12 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width)); build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + support::cpp11::to_string(pad_x_before_remainder)); build_opts.add_option("-DVEC_SIZE_LEFTOVER_WRITE=" + support::cpp11::to_string(vec_size_leftover_write)); - if(padding.size() > 1) + if (padding.size() > 1) { build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before)); build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height)); - if(padding.size() > 2) + if (padding.size() > 2) { build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before)); build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth)); @@ -121,23 +136,25 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const } std::string kernel_name = "pad_layer_"; - switch(mode) + switch (mode) { case PaddingMode::CONSTANT: { kernel_name += "constant"; - const unsigned int vec_size_leftover_read = vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start); + const unsigned int vec_size_leftover_read = + vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start); build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type)); build_opts.add_option("-DVEC_SIZE_LEFTOVER_READ=" + support::cpp11::to_string(vec_size_leftover_read)); - if(pad_x_before >= vec_size) + if (pad_x_before >= vec_size) { build_opts.add_option("-DTHREADS_TO_SKIP_BEFORE=" + support::cpp11::to_string(pad_x_before / vec_size)); - build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" + support::cpp11::to_string(pad_right_start / vec_size)); + build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" + + support::cpp11::to_string(pad_right_start / vec_size)); } - if(_4d_enabled) + if (_4d_enabled) { build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first)); build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(input->info()->dimension(3))); @@ -154,14 +171,17 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const const unsigned int pad_x_after_remainder = pad_right_start % vec_size; const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect; - const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size); + const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size); build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect)); build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" + support::cpp11::to_string(pad_x_after_remainder)); - build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size)); - 
build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size)); + build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size)); + build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size)); build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x)); - build_opts.add_option_if(after_pad_fact_x < output_last_x, "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size)); + build_opts.add_option_if(after_pad_fact_x < output_last_x, + "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size)); break; } @@ -179,7 +199,11 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +Status CLPadLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode)); return Status{}; @@ -197,13 +221,12 @@ void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue) unsigned int idx = 0; add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); - if(_4d_enabled) + if (_4d_enabled) { add_argument<unsigned int>(idx, batch++); } enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLPadLayerKernel.h b/src/core/CL/kernels/CLPadLayerKernel.h index 90af337f94..dca121b6a1 100644 --- a/src/core/CL/kernels/CLPadLayerKernel.h +++ b/src/core/CL/kernels/CLPadLayerKernel.h @@ -56,7 +56,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). */ - void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT); + void configure(const ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); /** Set the input and output tensor. * * @param[in] compile_context The compile context to be used. @@ -68,8 +72,12 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). 
*/ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), - PaddingMode mode = PaddingMode::CONSTANT); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); /** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel * * @param[in] input Source tensor info. Data types supported: All. @@ -80,7 +88,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp index bf1b874dd0..7dcdf1de6f 100644 --- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp +++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" using namespace arm_compute::misc::shape_calculator; @@ -42,7 +42,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status validate_arguments(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32); @@ -51,10 +54,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, // Check variances const int var_size = info.variances().size(); - if(var_size > 1) + if (var_size > 1) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values"); - for(int i = 0; i < var_size; ++i) + for (int i = 0; i < var_size; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0"); } @@ -62,17 +65,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0"); - if(!info.max_sizes().empty()) + if (!info.max_sizes().empty()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), + "Max and min sizes dimensions should match"); } - for(unsigned int i = 0; i < 
info.max_sizes().size(); ++i) + for (unsigned int i = 0; i < info.max_sizes().size(); ++i) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], + "Max size should be greater than min size"); } - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2); } @@ -80,7 +85,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, return Status{}; } -std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info, int num_priors) +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, + const ITensorInfo *input2, + ITensorInfo *output, + const PriorBoxLayerInfo &info, + int num_priors) { ARM_COMPUTE_UNUSED(input2); // Output tensor auto initialization if not yet initialized @@ -88,10 +97,11 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input auto_init_if_empty(*output, output_shape, 1, input1->data_type()); const unsigned int num_elems_processed_per_iteration = 4 * num_priors; - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); bool window_changed = update_window_and_padding(win, output_access); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -102,13 +112,25 @@ CLPriorBoxLayerKernel::CLPriorBoxLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios) +void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info, min, max, aspect_ratios); } -void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, - cl::Buffer *max, cl::Buffer *aspect_ratios) +void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); @@ -135,7 +157,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c int img_width = info.img_size().x; int img_height = info.img_size().y; - if(img_width == 0 || img_height == 0) + if (img_width == 0 || img_height == 0) { img_width = input2->info()->dimension(width_idx); img_height = input2->info()->dimension(height_idx); @@ -143,7 +165,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c float step_x = info.steps()[0]; float step_y = info.steps()[0]; - if(step_x == 0.f || step_y == 0.f) + if (step_x == 0.f || step_y == 0.f) { step_x = static_cast<float>(img_width) / layer_width; step_y = static_cast<float>(img_height) / layer_height; @@ -162,18 +184,20 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(info.offset())); build_opts.add_option_if(info.clip(), "-DIN_PLACE"); - if(info.variances().size() > 1) + if (info.variances().size() > 1) { - for(unsigned int i = 0; i < info.variances().size(); ++i) + for (unsigned int i = 0; i < info.variances().size(); ++i) { - build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(i))); + build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(info.variances().at(i))); } } else { - for(unsigned int i = 0; i < 4; ++i) + for (unsigned int i = 0; i < 4; ++i) { - build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(0))); + build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(info.variances().at(0))); } } @@ -194,13 +218,17 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c ICLKernel::configure_internal(win_config.second); } -Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { 
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info)); const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get(), info, num_priors) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), + output->clone().get(), info, num_priors) + .first); return Status{}; } @@ -211,8 +239,9 @@ void CLPriorBoxLayerKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); queue.enqueueWriteBuffer(*_min, CL_TRUE, 0, _info.min_sizes().size() * sizeof(float), _info.min_sizes().data()); - queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), _info.aspect_ratios().data()); - if(!_info.max_sizes().empty()) + queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), + _info.aspect_ratios().data()); + if (!_info.max_sizes().empty()) { queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data()); } diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.h b/src/core/CL/kernels/CLPriorBoxLayerKernel.h index 6c369a7a4e..a50e0c5ff5 100644 --- a/src/core/CL/kernels/CLPriorBoxLayerKernel.h +++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.h @@ -57,7 +57,13 @@ public: * @param[in] max Maximum prior box values * @param[in] aspect_ratios Aspect ratio values */ - void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios); + void configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -69,8 +75,14 @@ public: * @param[in] max Maximum prior box values * @param[in] aspect_ratios Aspect ratio values */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, - cl::Buffer *aspect_ratios); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info, + cl::Buffer *min, + cl::Buffer *max, + cl::Buffer *aspect_ratios); /** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayerKernel * * @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC. 
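To make the prior bookkeeping concrete: CLPriorBoxLayerKernel::validate() above computes num_priors = aspect_ratios().size() * min_sizes().size() + max_sizes().size(). With, say, 3 aspect ratios, 2 min sizes and 2 max sizes (illustrative numbers), num_priors = 3 * 2 + 2 = 8; each prior carries 4 box coordinates, so the window set up in validate_and_configure_window() steps over 4 * 8 = 32 output elements per iteration.
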
@@ -80,14 +92,17 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; private: - const ICLTensor *_input1; - const ICLTensor *_input2; + const ICLTensor *_input1; + const ICLTensor *_input2; ICLTensor *_output; PriorBoxLayerInfo _info; int _num_priors; diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp index bd573e54c8..731fcb8e04 100644 --- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp +++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp @@ -22,10 +22,12 @@ * SOFTWARE. */ #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -49,14 +51,19 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen const uint32_t temp_num_elems_processed_per_iteration = max_cl_vector_width / input->element_size(); /* If width is less than step, then make step same as width to avoid global size being step instead of actual width. */ /* Or we should fix in arm_compute::enqueue() or arm_compute::calculate_max_window(). */ - const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) ? input->dimension(0) : temp_num_elems_processed_per_iteration; + const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) + ?
input->dimension(0) + : temp_num_elems_processed_per_iteration; // This kernel doesn't need padding Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); return std::make_pair(Status{}, win); } -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *weight, + const ITensorInfo *bias) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weight, bias, output); @@ -72,7 +79,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -87,10 +94,14 @@ CLQLSTMLayerNormalizationKernel::CLQLSTMLayerNormalizationKernel() _type = CLKernelType::ELEMENTWISE; } -void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias) +void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *weight, + const ICLTensor *bias) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output); - auto padding_info = get_padding_info({ input, weight, bias, output }); + auto padding_info = get_padding_info({input, weight, bias, output}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), weight->info(), bias->info())); @@ -104,7 +115,8 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_ int32_t output_multiplier{}; int32_t output_shift{}; const UniformQuantizationInfo quan_info = _weight->info()->quantization_info().uniform(); - const Status status = quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift); + const Status status = + quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift); output_shift *= -1; // Set build options @@ -114,8 +126,12 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_ build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0))); build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); - build_opts.add_option("-DMIN_BOUND=" + support::cpp11::to_string(std::get<0>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); - build_opts.add_option("-DMAX_BOUND=" + support::cpp11::to_string(std::get<1>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); + build_opts.add_option("-DMIN_BOUND=" + + support::cpp11::to_string(std::get<0>( + quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); + build_opts.add_option("-DMAX_BOUND=" + + support::cpp11::to_string(std::get<1>( + quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type())))); // Create kernel _kernel = create_kernel(compile_context, "qlstm_layer_normalization", build_opts.options()); @@ -135,12 +151,18 @@ void CLQLSTMLayerNormalizationKernel::configure(const 
CLCompileContext &compile_ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias) +void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *weight, + const ICLTensor *bias) { configure(CLKernelLibrary::get().get_compile_context(), input, output, weight, bias); } -Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias) +Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *weight, + const ITensorInfo *bias) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, weight, bias)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); @@ -171,7 +193,6 @@ void CLQLSTMLayerNormalizationKernel::run(const Window &window, cl::CommandQueue add_2D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h index 31085c37ba..ba912e1d2d 100644 --- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h +++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h @@ -63,7 +63,11 @@ public: * @param[in] weight Weight tensor. Data types supported: Same as @p input. * @param[in] bias Bias tensor. Data types supported: S32. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *weight, + const ICLTensor *bias); /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayerNormalizationKernel * * @param[in] input Source tensor info with 2 dimensions. Data types supported: QSYMM16. 
@@ -73,7 +77,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp index 69a6fa5fa0..c97910ef79 100644 --- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp +++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,24 +43,29 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5); ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), + output->tensor_shape()); } - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16); @@ -82,12 +88,19 @@ CLROIAlignLayerKernel::CLROIAlignLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayerKernel::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { 
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info)); @@ -97,7 +110,7 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); output->info()->set_data_layout(input->info()->data_layout()); - auto padding_info = get_padding_info({ input, rois, output }); + auto padding_info = get_padding_info({input, rois, output}); _input = input; _output = output; @@ -111,16 +124,23 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type())); - build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH)))); - build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT)))); - build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL)))); + build_opts.add_option("-DMAX_DIM_X=" + + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index( + input->info()->data_layout(), DataLayoutDimension::WIDTH)))); + build_opts.add_option("-DMAX_DIM_Y=" + + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index( + input->info()->data_layout(), DataLayoutDimension::HEIGHT)))); + build_opts.add_option("-DMAX_DIM_Z=" + + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index( + input->info()->data_layout(), DataLayoutDimension::CHANNEL)))); build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width())); build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height())); build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale())); build_opts.add_option_if(input->info()->data_layout() == DataLayout::NHWC, "-DNHWC"); - build_opts.add_option_if(pool_info.sampling_ratio() > 0, "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio())); + build_opts.add_option_if(pool_info.sampling_ratio() > 0, + "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio())); - if(is_qasymm) + if (is_qasymm) { const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); const UniformQuantizationInfo roisq_info = rois->info()->quantization_info().uniform(); @@ -144,7 +164,10 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info)); return Status{}; diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.h b/src/core/CL/kernels/CLROIAlignLayerKernel.h 
index 5284a5913f..2e84e5d303 100644 --- a/src/core/CL/kernels/CLROIAlignLayerKernel.h +++ b/src/core/CL/kernels/CLROIAlignLayerKernel.h @@ -61,7 +61,8 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -77,7 +78,11 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayerKernel * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. @@ -93,7 +98,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue); diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp index f6933c6cfd..1b2c414a49 100644 --- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -48,7 +49,10 @@ CLROIPoolingLayerKernel::CLROIPoolingLayerKernel() _type = CLKernelType::ELEMENTWISE; } -Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output); @@ -61,10 +65,11 @@ Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensor ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height())); + ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != 
pool_info.pooled_width()) || + (output->dimension(1) != pool_info.pooled_height())); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2)); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3)); } @@ -72,20 +77,30 @@ Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensor return Status{}; } -void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayerKernel::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + const ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { - ARM_COMPUTE_ERROR_THROW_ON(CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info)); + ARM_COMPUTE_ERROR_THROW_ON( + CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info)); - auto padding_info = get_padding_info({ input, rois, output }); + auto padding_info = get_padding_info({input, rois, output}); // Output auto initialization if not yet initialized - TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1)); - auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(), output->info()->quantization_info()); + TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), + rois->info()->dimension(1)); + auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(), + output->info()->quantization_info()); // Set instance variables _input = input; @@ -107,11 +122,12 @@ void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height())); build_opts.add_option("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale())); - if(is_qasymm) + if (is_qasymm) { // Determine quantization info scale, offset UniformQuantizationInfo uqinfo = UniformQuantizationInfo(); - uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(), _output->info()->quantization_info().uniform()); + uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(), + _output->info()->quantization_info().uniform()); build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(uqinfo.offset)); build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(uqinfo.scale)); diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.h b/src/core/CL/kernels/CLROIPoolingLayerKernel.h index 7b7b457632..80bfb63092 100644 --- a/src/core/CL/kernels/CLROIPoolingLayerKernel.h +++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.h @@ -59,7 +59,8 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. 
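Putting these shape constraints together with the auto-initialization in the source file above: the output is laid out as (pooled width, pooled height, input channels, number of ROIs). With illustrative numbers, a 7x7 pooled size over a 64-channel feature map and 100 ROIs gives an output of shape 7x7x64x100; validate() rejects any pre-configured output whose first two dimensions do not match the pooled size or whose last dimension does not match the ROI count.
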
*/ - void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info); /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. @@ -74,7 +75,11 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + const ICLTensor *output, + const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; @@ -92,7 +97,10 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); private: const ICLTensor *_input; diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp index a06c2eed75..622f6210b9 100644 --- a/src/core/CL/kernels/CLRangeKernel.cpp +++ b/src/core/CL/kernels/CLRangeKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,11 +43,8 @@ constexpr unsigned int vector_size_byte_opencl = 16; Status validate_arguments(const ITensorInfo *output, const float start, const float end, const float step) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, - DataType::U16, DataType::S16, - DataType::U32, DataType::S32, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::QASYMM8, + DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output); @@ -56,19 +54,22 @@ Status validate_arguments(const ITensorInfo *output, const float start, const fl ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()), "start value is outside the range of the data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()), "end value is outside the range of the data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()), "step value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()), + 
"start value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()), + "end value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()), + "step value is outside the range of the data type"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->num_dimensions() != 1, "Output has to be a 1-D tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step), + "Output tensor size is incorrect"); return Status{}; } } // namespace -CLRangeKernel::CLRangeKernel() - : _start(0), _end(1), _step(1), _output(nullptr) +CLRangeKernel::CLRangeKernel() : _start(0), _end(1), _step(1), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -78,16 +79,18 @@ void CLRangeKernel::configure(ICLTensor *output, const float start, const float configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step); } -void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) +void CLRangeKernel::configure( + const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) { ARM_COMPUTE_ERROR_ON_NULLPTR(output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(output->info(), start, end, step)); // Configure kernel window - unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0)); - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0)); + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - auto padding_info = get_padding_info({ output }); + auto padding_info = get_padding_info({output}); _start = start; _end = end; @@ -100,10 +103,11 @@ void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DSTART=" + support::cpp11::to_string(start)); build_opts.add_option("-DSTEP=" + support::cpp11::to_string(step)); - if(is_data_type_quantized_asymmetric(output->info()->data_type())) + if (is_data_type_quantized_asymmetric(output->info()->data_type())) { const UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform(); build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(qinfo.offset)); diff --git a/src/core/CL/kernels/CLRangeKernel.h b/src/core/CL/kernels/CLRangeKernel.h index 1b94a099ed..65251a11e5 100644 --- a/src/core/CL/kernels/CLRangeKernel.h +++ 
b/src/core/CL/kernels/CLRangeKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLRANGEKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp index e5cfb997ca..70875a2d40 100644 --- a/src/core/CL/kernels/CLReductionOperationKernel.cpp +++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp @@ -28,15 +28,15 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute @@ -47,23 +47,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - if(input->num_channels() == 1) + if (input->num_channels() == 1) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); } else { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(axis == 0); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, + "Not supported reduction operation for QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) && (input->data_type() != DataType::QASYMM8) - && (input->data_type() != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), "Not supported reduction operation, use CLArgMinMaxLayer"); + ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) && + (input->data_type() != DataType::QASYMM8) && + (input->data_type() != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), + "Not supported reduction operation, use CLArgMinMaxLayer"); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -79,33 +84,42 @@ CLReductionOperationKernel::CLReductionOperationKernel() _type = 
CLKernelType::ELEMENTWISE; } -void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLReductionOperationKernel::configure(const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op); } -void CLReductionOperationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLReductionOperationKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; _reduction_axis = axis; _op = op; - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true)); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true)); // Set build options CLBuildOptions build_opts; DataType data_type = input->info()->data_type(); std::string data_type_promoted{}; - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { data_type_promoted = "int"; } @@ -130,10 +144,14 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD"); build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN"); build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX"); - build_opts.add_option_if(is_data_type_quantized(data_type), "-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().uniform().offset)); - build_opts.add_option_if(is_data_type_quantized(data_type), "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale)); - - switch(op) + build_opts.add_option_if(is_data_type_quantized(data_type), + "-DOFFSET=" + + support::cpp11::to_string(input->info()->quantization_info().uniform().offset)); + build_opts.add_option_if( + is_data_type_quantized(data_type), + "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale)); + + switch (op) { case ReductionOperation::SUM_SQUARE: build_opts.add_option(("-DOPERATION=square_sum")); @@ -159,7 +177,7 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte std::string kernel_axis_name; const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis); - switch(axis) + switch (axis) { case 0: { @@ -187,13 +205,17 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte // Configure kernel window Window win = calculate_max_window(*input->info(), Steps(vec_size)); - win.set(Window::DimX, Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step())); + win.set(Window::DimX, + 
Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step())); ICLKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) +Status CLReductionOperationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); return Status{}; @@ -205,18 +227,19 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis); - switch(_reduction_axis) + switch (_reduction_axis) { case 0: { // We use parallel reduction only in non quantized types - if(is_serial_op) + if (is_serial_op) { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); + Window window_in{window}; + window_in.set(Window::DimX, + Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); - Window out_window{ window }; + Window out_window{window}; out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); Window in_slice = window_in.first_slice_window_1D(); @@ -228,8 +251,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que add_1D_tensor_argument(idx, _input, in_slice); add_1D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); + } while (window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); } else { @@ -251,8 +273,9 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que case 1: { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); + Window window_in{window}; + window_in.set(Window::DimY, + Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); Window in_slice = window_in.first_slice_window_2D(); Window out_slice = window.first_slice_window_2D(); @@ -262,15 +285,15 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que add_2D_tensor_argument(idx, _input, in_slice); add_2D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); + } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); } break; case 2: { // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); + Window window_in{window}; + window_in.set(Window::DimZ, + Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); Window in_slice = window_in.first_slice_window_3D(); Window out_slice = window.first_slice_window_3D(); @@ -280,14 +303,13 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que add_3D_tensor_argument(idx, _input, in_slice); add_3D_tensor_argument(idx, _output, 
out_slice); enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); + } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); } break; case 3: { // Get first input and output slices - Window window_in{ window }; + Window window_in{window}; window_in.set(3, Window::Dimension(0, 1, 1)); Window in_slice = window_in.first_slice_window_4D(); Window out_slice = window.first_slice_window_4D(); @@ -298,8 +320,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que add_4D_tensor_argument(idx, _input, in_slice); add_4D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); + } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); } break; default: diff --git a/src/core/CL/kernels/CLReductionOperationKernel.h b/src/core/CL/kernels/CLReductionOperationKernel.h index b456378746..2f94b2add3 100644 --- a/src/core/CL/kernels/CLReductionOperationKernel.h +++ b/src/core/CL/kernels/CLReductionOperationKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -67,7 +68,11 @@ public: * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 * @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op); /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel. 
* @@ -79,7 +84,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp index 3c74e80d33..9fd21943e8 100644 --- a/src/core/CL/kernels/CLReorgLayerKernel.cpp +++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp @@ -28,9 +28,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -51,13 +52,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, + "The width of the input tensor must be a multiple of stride"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, + "The height of the input tensor must be a multiple of stride"); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride)); + const TensorInfo tensor_info_output = + output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -66,8 +70,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } } // namespace -CLReorgLayerKernel::CLReorgLayerKernel() - : _input(nullptr), _output(nullptr) +CLReorgLayerKernel::CLReorgLayerKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -77,17 +80,22 @@ void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, in configure(CLKernelLibrary::get().get_compile_context(), input, output, stride); } -void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t stride) +void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t stride) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride)); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); _input = input; _output = output; - std::string kernel_name = 
std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout())); - const size_t idx_channel = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); + std::string kernel_name = + std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout())); + const size_t idx_channel = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); // Create kernel CLBuildOptions build_opts; @@ -98,7 +106,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons // Configure window // auto inizialize the output tensor if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride))); + auto_init_if_empty(*output->info(), + input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride))); Window win = calculate_max_window(*output->info(), Steps()); @@ -119,7 +129,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, int32_t stride) +Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, + const arm_compute::ITensorInfo *output, + int32_t stride) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride)); @@ -139,7 +151,6 @@ void CLReorgLayerKernel::run(const Window &window, cl::CommandQueue &queue) add_3D_tensor_argument(idx, _input, slice); add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLReorgLayerKernel.h b/src/core/CL/kernels/CLReorgLayerKernel.h index 455a6170c6..f335071e9f 100644 --- a/src/core/CL/kernels/CLReorgLayerKernel.h +++ b/src/core/CL/kernels/CLReorgLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLREORGLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp index 0d70ff4f3c..79a0f03b1e 100644 --- a/src/core/CL/kernels/CLReverseKernel.cpp +++ b/src/core/CL/kernels/CLReverseKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -49,7 +50,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -60,8 +61,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLReverseKernel::CLReverseKernel() - : _input(nullptr), _output(nullptr), _axis(nullptr) +CLReverseKernel::CLReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr) { 
_type = CLKernelType::ELEMENTWISE; } @@ -71,10 +71,13 @@ void CLReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const configure(CLKernelLibrary::get().get_compile_context(), input, output, axis); } -void CLReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +void CLReverseKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis); - auto padding_info = get_padding_info({ input, output, axis }); + auto padding_info = get_padding_info({input, output, axis}); _input = input; _output = output; @@ -138,7 +141,6 @@ void CLReverseKernel::run(const Window &window, cl::CommandQueue &queue) add_1D_tensor_argument(idx, _axis, axis_slice); add_4D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_4D(slice)); + } while (collapsed.slide_window_slice_4D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLReverseKernel.h b/src/core/CL/kernels/CLReverseKernel.h index 4a21e4f802..fbd99dc883 100644 --- a/src/core/CL/kernels/CLReverseKernel.h +++ b/src/core/CL/kernels/CLReverseKernel.h @@ -60,7 +60,10 @@ public: * @param[out] output Output tensor. Data type supported: Same as @p input * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32 */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis); /** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel * diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp index c0e014e8b8..703c64d8d3 100644 --- a/src/core/CL/kernels/CLSelectKernel.cpp +++ b/src/core/CL/kernels/CLSelectKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" namespace arm_compute @@ -51,9 +51,11 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape())); - ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1]))); + ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && + ((c->tensor_shape().num_dimensions() > 1) || + (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1]))); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output); @@ -63,13 +65,16 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen } } // namespace -CLSelectKernel::CLSelectKernel() - : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false) +CLSelectKernel::CLSelectKernel() : 
_c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false) { _type = CLKernelType::ELEMENTWISE; } -void CLSelectKernel::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output) +void CLSelectKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(c, x, y, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(c->info(), x->info(), y->info(), output->info())); @@ -80,7 +85,7 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC _output = output; _has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions()); - auto padding_info = get_padding_info({ c, x, y, output }); + auto padding_info = get_padding_info({c, x, y, output}); const unsigned int vec_size_x = adjust_vec_size(16 / x->info()->element_size(), x->info()->dimension(0)); const int vec_size_x_leftovers = output->info()->dimension(0) % vec_size_x; @@ -92,14 +97,14 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC // Create kernel std::string kernel_name = "select"; - if(_has_same_rank) + if (_has_same_rank) { kernel_name += "_same_rank"; } else { const bool is_input_rank_greater_than_two = x->info()->tensor_shape().num_dimensions() > 2; - if(is_input_rank_greater_than_two) + if (is_input_rank_greater_than_two) { const size_t width = x->info()->tensor_shape().x(); const size_t height = x->info()->tensor_shape().y(); @@ -128,7 +133,8 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) +Status +CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(c, x, y, output)); return Status{}; @@ -142,7 +148,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); - if(!_has_same_rank) + if (!_has_same_rank) { Window vector_slice = window.first_slice_window_1D(); vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0)); @@ -153,7 +159,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu do { unsigned int idx = _has_same_rank ? 0 : num_arguments_per_1D_tensor(); - if(_has_same_rank) + if (_has_same_rank) { add_3D_tensor_argument(idx, _c, slice); } @@ -162,7 +168,6 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLSelectKernel.h b/src/core/CL/kernels/CLSelectKernel.h index b8c10cd7cf..c4256fd743 100644 --- a/src/core/CL/kernels/CLSelectKernel.h +++ b/src/core/CL/kernels/CLSelectKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLSELECTKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -60,7 +61,11 @@ public: * @param[out] y Second input tensor. 
Data types supported: Same as @p x * @param[in] output Output tensor. Data types supported: Same as @p x. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSelectKernel * * @param[in] c Condition input tensor. Data types supported: U8. diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp index 3632ae2b03..f4c0839ad2 100644 --- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp +++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,19 +39,22 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *block_info, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 }); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2}); ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 }); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2}); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const DataLayout data_layout = input->data_layout(); const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); @@ -61,7 +65,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf return Status{}; } -Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status validate_arguments_static(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -70,9 +78,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right); + TensorShape expected_output_shape = 
misc::shape_calculator::compute_space_to_batch_shape( + input, block_shape_x, block_shape_y, padding_left, padding_right); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -88,16 +97,24 @@ CLSpaceToBatchLayerKernel::CLSpaceToBatchLayerKernel() _type = CLKernelType::ELEMENTWISE; } -void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); } -void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); - auto padding_info = get_padding_info({ input, block_shape, paddings, output }); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); + auto padding_info = get_padding_info({input, block_shape, paddings, output}); _input = input; _block_shape = block_shape; @@ -111,14 +128,17 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width))); build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height))); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch))); build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width))); build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height))); build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch))); - _kernel = create_kernel(compile_context, "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel(compile_context, + "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -126,22 +146,34 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, - ICLTensor 
*output) +void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, + padding_right, output); } -void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, - const Size2D &padding_right, - ICLTensor *output) +void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape( + input->info(), block_shape_x, block_shape_y, padding_left, padding_right); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, + padding_right, output->info())); _input = input; _output = output; @@ -153,7 +185,8 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width))); build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height))); build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch))); @@ -166,22 +199,32 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DPAD_RIGHT_X=" + support::cpp11::to_string(padding_right.x())); build_opts.add_option("-DPAD_LEFT_Y=" + support::cpp11::to_string(padding_left.y())); build_opts.add_option("-DPAD_RIGHT_Y=" + support::cpp11::to_string(padding_right.y())); - _kernel = create_kernel(compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel( + compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); ICLKernel::configure_internal(win); } -Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, 
const ITensorInfo *paddings, const ITensorInfo *output) +Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output)); return Status{}; } -Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -218,7 +261,6 @@ void CLSpaceToBatchLayerKernel::run(const Window &window, cl::CommandQueue &queu add_3D_tensor_argument(idx, _output, slice_out); enqueue(queue, *this, slice_out, lws_hint()); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h index 4817cfeef2..f9dce9db47 100644 --- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h +++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -63,7 +64,11 @@ public: * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32 * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output); /** Initialise the kernel's input and output. (Static block shape and paddings) * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -73,7 +78,12 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output); + void configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output); /** Initialise the kernel's input and output. (Static block shape and paddings) * * @param[in] compile_context The compile context to be used. @@ -84,8 +94,13 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. 
Data types supported: same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, - ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -95,7 +110,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel (Static block shape and paddings) * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -107,7 +125,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp index c5ffdb588b..25662b5c62 100644 --- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp +++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -45,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const DataLayout data_layout = input->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -64,8 +65,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } } // namespace -CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel() - : _input(nullptr), _output(nullptr), _block_shape() +CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel() : _input(nullptr), _output(nullptr), _block_shape() { _type = CLKernelType::ELEMENTWISE; } @@ -75,10 +75,13 @@ void CLSpaceToDepthLayerKernel::configure(const ICLTensor *input, ICLTensor *out configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + 
ICLTensor *output, + int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, output }); + auto padding_info = get_padding_info({input, output}); TensorShape output_shape = compute_space_to_depth_shape(input->info(), block_shape); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); @@ -94,11 +97,14 @@ void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_contex // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type()))); build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_channel))); build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape)); build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(output->info()->dimension(idx_width))); - _kernel = create_kernel(compile_context, "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()); + _kernel = create_kernel(compile_context, + "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())), + build_opts.options()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -136,7 +142,6 @@ void CLSpaceToDepthLayerKernel::run(const Window &window, cl::CommandQueue &queu enqueue(queue, *this, slice_out, lws_hint()); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h index bb1ac5f9a6..d0932919e0 100644 --- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h +++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -61,7 +62,8 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] block_shape Block shape value. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); + void + configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayerKernel. * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. 
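
The hunks up to this point are dominated by two purely mechanical rewrites rather than behavioural changes: function signatures that exceed the column limit are re-wrapped with one parameter per line, and short constructor initializer lists are joined onto the declaration line. The sketch below illustrates the two rules; ExampleKernel and its members are hypothetical stand-ins for illustration only, not code from this patch.

    #include <cstdint>

    // Forward declarations stand in for the real Arm Compute Library headers.
    class CLCompileContext;
    class ICLTensor;

    // Hypothetical kernel used only to illustrate the formatting rules.
    class ExampleKernel
    {
    public:
        // Short initializer lists now stay on the declaration line.
        ExampleKernel() : _input(nullptr), _output(nullptr)
        {
        }
        // A signature over the column limit is broken one parameter per line.
        void configure(const CLCompileContext &compile_context,
                       const ICLTensor *input,
                       ICLTensor *output,
                       int32_t stride);

    private:
        const ICLTensor *_input;
        ICLTensor *_output;
    };

These two rules account for most of the churn in the CLReorg, CLReverse, CLSelect, CLSpaceToBatch and CLSpaceToDepth kernels above: the bodies of configure(), validate() and run() are unchanged, only their layout moves.
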
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp index 075c93ab60..23e26716e7 100644 --- a/src/core/CL/kernels/CLStackLayerKernel.cpp +++ b/src/core/CL/kernels/CLStackLayerKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/StringSupport.h" using namespace arm_compute::misc::shape_calculator; @@ -42,7 +42,11 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); @@ -51,9 +55,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), + compute_stack_shape(*input, axis, num_tensors)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } @@ -61,7 +66,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output) { // Output auto-initialization if not yet initialized auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors))); @@ -73,18 +79,23 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsi } } // namespace -CLStackLayerKernel::CLStackLayerKernel() - : _input(nullptr), _output(nullptr) +CLStackLayerKernel::CLStackLayerKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } -void CLStackLayerKernel::configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output) +void CLStackLayerKernel::configure( + const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, axis, idx_input, num_tensors, output); } -void CLStackLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output) +void CLStackLayerKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, 
idx_input, num_tensors, output->info())); @@ -112,10 +123,15 @@ void CLStackLayerKernel::configure(const CLCompileContext &compile_context, cons _kernel.setArg<cl_uint>(idx, idx_input); } -Status CLStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status CLStackLayerKernel::validate(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first); return Status{}; } diff --git a/src/core/CL/kernels/CLStackLayerKernel.h b/src/core/CL/kernels/CLStackLayerKernel.h index 2865127a90..d3c17f529c 100644 --- a/src/core/CL/kernels/CLStackLayerKernel.h +++ b/src/core/CL/kernels/CLStackLayerKernel.h @@ -26,6 +26,7 @@ #define ARM_COMPUTE_CLSTACKLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" namespace arm_compute @@ -60,7 +61,8 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input. * */ - void configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output); + void configure( + const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output); /** Initialise the kernel's inputs and output * * @note Supported input tensor rank: up to 4 @@ -74,7 +76,12 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input. * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel * * @note Supported input tensor rank: up to 4 @@ -88,7 +95,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp index 9acbafdb19..a8f6112820 100644 --- a/src/core/CL/kernels/CLStridedSliceKernel.cpp +++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp @@ -22,11 +22,13 @@ * SOFTWARE. 
*/ #include "src/core/CL/kernels/CLStridedSliceKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/bit_ops.h" @@ -37,9 +39,14 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); @@ -48,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) - { - return i == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; })); // Get expected output shape - const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0); // Checks output if configured - if(output->total_size() != 0) + if (output->total_size() != 0) { const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info); @@ -76,28 +80,33 @@ CLStridedSliceKernel::CLStridedSliceKernel() _type = CLKernelType::ELEMENTWISE; } -void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - auto padding_info = get_padding_info({ input, output }); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); + auto padding_info = get_padding_info({input, output}); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); const TensorShape &input_shape = 
input->tensor_shape(); Coordinates starts_abs; Coordinates ends_abs; Coordinates final_strides; - std::tie(starts_abs, ends_abs, final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords( - input_shape, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + std::tie(starts_abs, ends_abs, final_strides) = + arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides, + begin_mask, end_mask, shrink_axis_mask); // Configure kernel window - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape)); Window win = calculate_max_window(*output, Steps()); @@ -108,29 +117,33 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co const bool multi_access_x = !is_shrink_on_x && (final_strides.x() == 1) && (output_width_x / vec_size_x > 0); // Update window if needed - if(multi_access_x) + if (multi_access_x) { Window &updated_window = win; updated_window.set(Window::DimX, - Window::Dimension(updated_window.x().start(), ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x)); + Window::Dimension(updated_window.x().start(), + ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); // Create build options CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type()))); - for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i) + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type()))); + for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i) { const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i); - build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i])); - build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i])); + build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(starts_abs[i])); + build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + + support::cpp11::to_string(final_strides[i])); build_opts.add_option_if(is_shrink, "-DSHRINK_" + support::cpp11::to_string(i)); } - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string( + std::max<int>(output_width_x - vec_size_x, 0))); build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option_if_else(input_shape.num_dimensions() > 2, - "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()), - "-DSRC_DEPTH=1"); + "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()), "-DSRC_DEPTH=1"); build_opts.add_option_if_else(output->num_dimensions() > 2, "-DDST_DEPTH=" + support::cpp11::to_string(output->tensor_shape().z()), "-DDST_DEPTH=1"); @@ -142,7 +155,7 @@ void CLStridedSliceKernel::configure(const 
CLCompileContext &compile_context, co _config_id = "strided_slice"; _config_id += "_"; _config_id += lower_string(string_from_data_type(input->data_type())); - for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i) + for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i) { _config_id += "_"; _config_id += support::cpp11::to_string(input->dimension(i)); @@ -156,11 +169,17 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status CLStridedSliceKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); return Status{}; } @@ -170,8 +189,9 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = window_collapsed.first_slice_window_4D(); @@ -182,7 +202,6 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl add_4D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_4D(slice)); + } while (window_collapsed.slide_window_slice_4D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLStridedSliceKernel.h b/src/core/CL/kernels/CLStridedSliceKernel.h index 4c201504f5..1cf5bcacec 100644 --- a/src/core/CL/kernels/CLStridedSliceKernel.h +++ b/src/core/CL/kernels/CLStridedSliceKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/CL/ICLKernel.h" #include <cstdint> @@ -53,9 +54,15 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel * @@ -71,9 +78,14 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp index 3e7015cfd2..fa996c4008 100644 --- a/src/core/CL/kernels/CLTileKernel.cpp +++ b/src/core/CL/kernels/CLTileKernel.cpp @@ -22,9 +22,11 @@ * SOFTWARE. */ #include "src/core/CL/kernels/CLTileKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -39,15 +41,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4); ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) - { - return e == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; })); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -55,8 +55,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -CLTileKernel::CLTileKernel() - : _input(nullptr), _output(nullptr) +CLTileKernel::CLTileKernel() : _input(nullptr), _output(nullptr) { _type = CLKernelType::ELEMENTWISE; } @@ -66,7 +65,10 @@ void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Mu configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples); } -void CLTileKernel::configure(const 
CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples) +void CLTileKernel::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -104,15 +106,14 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT // Configure window without padding Window win = calculate_max_window(*output->info()); - if(multi_access_x) + if (multi_access_x) { // If multi-access is enabled, no thread should cross the tile boundaries. This means we need // as many threads as those to cover a single tile times multiples[0]. Note that if threads // do not cross the boundaries of the tiles, they won't cross the boundaries of the last tile, and // we don't need to pad the output const unsigned int size_win_x = ceil_to_multiple(input->info()->dimension(0), vec_size_x) * multiples[0]; - win.set(Window::DimX, - Window::Dimension(win.x().start(), size_win_x, vec_size_x)); + win.set(Window::DimX, Window::Dimension(win.x().start(), size_win_x, vec_size_x)); } ICLKernel::configure_internal(win); @@ -121,7 +122,7 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT _config_id = "tile"; _config_id += "_"; _config_id += lower_string(string_from_data_type(input->info()->data_type())); - for(unsigned int i = 0; i < multiples.size(); ++i) + for (unsigned int i = 0; i < multiples.size(); ++i) { _config_id += "_"; _config_id += support::cpp11::to_string(input->info()->dimension(i)); @@ -150,7 +151,6 @@ void CLTileKernel::run(const Window &window, cl::CommandQueue &queue) add_4D_tensor_argument(idx, _input, slice); add_4D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_4D(slice)); + } while (collapsed.slide_window_slice_4D(slice)); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLTileKernel.h b/src/core/CL/kernels/CLTileKernel.h index 41752ca90b..c3486aecef 100644 --- a/src/core/CL/kernels/CLTileKernel.h +++ b/src/core/CL/kernels/CLTileKernel.h @@ -64,7 +64,10 @@ public: * @param[out] output Destination tensor. Same as @p input * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples); + void configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples); /** Static function to check if given info will lead to a valid configuration of @ref CLTileKernel * * @param[in] input Source tensor info. Data type supported: All. 
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp index 6a3f66fd5a..9980db42f3 100644 --- a/src/core/CPP/CPPTypes.cpp +++ b/src/core/CPP/CPPTypes.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/CPP/CPPTypes.h" #include "arm_compute/core/Error.h" + #include "src/common/cpuinfo/CpuInfo.h" #include "src/common/cpuinfo/CpuIsaInfo.h" @@ -43,8 +44,7 @@ CPUInfo &CPUInfo::get() return _cpuinfo; } -CPUInfo::CPUInfo() - : _impl(std::make_unique<Impl>()) +CPUInfo::CPUInfo() : _impl(std::make_unique<Impl>()) { _impl->info = cpuinfo::CpuInfo::build(); } diff --git a/src/core/CPP/Validate.h b/src/core/CPP/Validate.h index df192b5131..fe253508cf 100644 --- a/src/core/CPP/Validate.h +++ b/src/core/CPP/Validate.h @@ -38,8 +38,8 @@ namespace arm_compute * * @return Status */ -inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, - const ITensorInfo *tensor_info) +inline Status +error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info) { bool fp16_kernels_enabled = false; #if defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS) @@ -47,8 +47,9 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi #endif /* defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS) */ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::F16) && (!CPUInfo::get().has_fp16() || !fp16_kernels_enabled), - function, file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above"); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG( + (tensor_info->data_type() == DataType::F16) && (!CPUInfo::get().has_fp16() || !fp16_kernels_enabled), function, + file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above"); return Status{}; } @@ -61,8 +62,8 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi * * @return Status */ -inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, - const ITensorInfo *tensor_info) +inline Status +error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info) { bool bf16_kernels_enabled = false; #if defined(ARM_COMPUTE_ENABLE_BF16) @@ -70,8 +71,9 @@ inline Status error_on_unsupported_cpu_bf16(const char *function, const char *fi #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::BFLOAT16) && (!CPUInfo::get().has_bf16() || !bf16_kernels_enabled), - function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above"); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG( + (tensor_info->data_type() == DataType::BFLOAT16) && (!CPUInfo::get().has_bf16() || !bf16_kernels_enabled), + function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above"); return Status{}; } @@ -84,8 +86,8 @@ inline Status error_on_unsupported_cpu_bf16(const char *function, const char *fi * * @return Status */ -inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, - const ITensor *tensor) +inline Status +error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensor *tensor) { 
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(function, file, line, tensor->info())); @@ -101,8 +103,8 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi * * @return Status */ -inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, - const ITensor *tensor) +inline Status +error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensor *tensor) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_bf16(function, file, line, tensor->info())); diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp index 0f405d8e83..02686eb4f6 100644 --- a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp +++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h" #include "arm_compute/core/Helpers.h" + #include "src/core/helpers/WindowHelpers.h" #include <algorithm> @@ -34,7 +35,11 @@ namespace arm_compute namespace { template <typename T> -std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &scores_in, std::vector<int> inds, const BoxNMSLimitInfo &info, int class_id) +std::vector<int> SoftNMS(const ITensor *proposals, + std::vector<std::vector<T>> &scores_in, + std::vector<int> inds, + const BoxNMSLimitInfo &info, + int class_id) { std::vector<int> keep; const int proposals_width = proposals->info()->dimension(1); @@ -45,7 +50,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> & std::vector<T> y2(proposals_width); std::vector<T> areas(proposals_width); - for(int i = 0; i < proposals_width; ++i) + for (int i = 0; i < proposals_width; ++i) { x1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i))); y1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i))); @@ -56,13 +61,13 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> & // Note: Soft NMS scores have already been initialized with input scores - while(!inds.empty()) + while (!inds.empty()) { // Find proposal with max score among remaining proposals int max_pos = 0; - for(unsigned int i = 1; i < inds.size(); ++i) + for (unsigned int i = 1; i < inds.size(); ++i) { - if(scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)]) + if (scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)]) { max_pos = i; } @@ -75,7 +80,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> & inds.erase(inds.begin()); std::vector<int> sorted_indices_temp; - for(auto idx : inds) + for (auto idx : inds) { const auto xx1 = std::max(x1[idx], x1[element]); const auto yy1 = std::max(y1[idx], y1[element]); @@ -89,7 +94,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> & // Update scores based on computed IoU, overlap threshold and NMS method T weight; - switch(info.soft_nms_method()) + switch (info.soft_nms_method()) { case NMSType::LINEAR: weight = (ovr > info.nms()) ? 
(1.f - ovr) : 1.f; @@ -106,7 +111,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> & // Discard boxes with new scores below min threshold and update pending indices scores_in[class_id][idx] *= weight; - if(scores_in[class_id][idx] >= info.soft_nms_min_score_thres()) + if (scores_in[class_id][idx] >= info.soft_nms_min_score_thres()) { sorted_indices_temp.push_back(idx); } @@ -118,7 +123,10 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> & } template <typename T> -std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> sorted_indices, const BoxNMSLimitInfo &info, int class_id) +std::vector<int> NonMaximaSuppression(const ITensor *proposals, + std::vector<int> sorted_indices, + const BoxNMSLimitInfo &info, + int class_id) { std::vector<int> keep; @@ -130,7 +138,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> std::vector<T> y2(proposals_width); std::vector<T> areas(proposals_width); - for(int i = 0; i < proposals_width; ++i) + for (int i = 0; i < proposals_width; ++i) { x1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i))); y1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i))); @@ -139,7 +147,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0); } - while(!sorted_indices.empty()) + while (!sorted_indices.empty()) { int i = sorted_indices.at(0); keep.push_back(i); @@ -148,7 +156,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> std::vector<int> new_indices; sorted_indices_temp.erase(sorted_indices_temp.begin()); - for(unsigned int j = 0; j < sorted_indices_temp.size(); ++j) + for (unsigned int j = 0; j < sorted_indices_temp.size(); ++j) { const float xx1 = std::max(x1[sorted_indices_temp.at(j)], x1[i]); const float yy1 = std::max(y1[sorted_indices_temp.at(j)], y1[i]); @@ -163,8 +171,9 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> const float ctr_y = yy1 + (h / 2); // If suppress_size is specified, filter the boxes based on their size and position - const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() && ctr_x < info.im_width() && ctr_y < info.im_height()); - if(ovr <= info.nms() && keep_size) + const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() && + ctr_x < info.im_width() && ctr_y < info.im_height()); + if (ovr <= info.nms() && keep_size) { new_indices.push_back(j); } @@ -172,7 +181,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> const unsigned int new_indices_size = new_indices.size(); std::vector<int> new_sorted_indices(new_indices_size); - for(unsigned int i = 0; i < new_indices_size; ++i) + for (unsigned int i = 0; i < new_indices_size; ++i) { new_sorted_indices[i] = sorted_indices[new_indices[i] + 1]; } @@ -184,7 +193,15 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> } // namespace CPPBoxWithNonMaximaSuppressionLimitKernel::CPPBoxWithNonMaximaSuppressionLimitKernel() - : _scores_in(nullptr), _boxes_in(nullptr), _batch_splits_in(nullptr), _scores_out(nullptr), _boxes_out(nullptr), _classes(nullptr), _batch_splits_out(nullptr), _keeps(nullptr), _keeps_size(nullptr), + : _scores_in(nullptr), + _boxes_in(nullptr), + _batch_splits_in(nullptr), + _scores_out(nullptr), + 
_boxes_out(nullptr), + _classes(nullptr), + _batch_splits_out(nullptr), + _keeps(nullptr), + _keeps_size(nullptr), _info() { } @@ -197,7 +214,7 @@ bool CPPBoxWithNonMaximaSuppressionLimitKernel::is_parallelisable() const template <typename T> void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() { - const int batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0); + const int batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0); const int num_classes = _scores_in->info()->dimension(0); const int scores_count = _scores_in->info()->dimension(1); std::vector<int> total_keep_per_batch(batch_size); @@ -205,51 +222,48 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() int total_keep_count = 0; std::vector<std::vector<T>> in_scores(num_classes, std::vector<T>(scores_count)); - for(int i = 0; i < scores_count; ++i) + for (int i = 0; i < scores_count; ++i) { - for(int j = 0; j < num_classes; ++j) + for (int j = 0; j < num_classes; ++j) { in_scores[j][i] = *reinterpret_cast<const T *>(_scores_in->ptr_to_element(Coordinates(j, i))); } } int cur_start_idx = 0; - for(int b = 0; b < batch_size; ++b) + for (int b = 0; b < batch_size; ++b) { // Skip first class if there is more than 1 except if the number of classes is 1. const int j_start = (num_classes == 1 ? 0 : 1); - for(int j = j_start; j < num_classes; ++j) + for (int j = j_start; j < num_classes; ++j) { std::vector<T> cur_scores(scores_count); std::vector<int> inds; - for(int i = 0; i < scores_count; ++i) + for (int i = 0; i < scores_count; ++i) { const T score = in_scores[j][i]; cur_scores[i] = score; - if(score > _info.score_thresh()) + if (score > _info.score_thresh()) { inds.push_back(i); } } - if(_info.soft_nms_enabled()) + if (_info.soft_nms_enabled()) { keeps[j] = SoftNMS(_boxes_in, in_scores, inds, _info, j); } else { std::sort(inds.data(), inds.data() + inds.size(), - [&cur_scores](int lhs, int rhs) - { - return cur_scores[lhs] > cur_scores[rhs]; - }); + [&cur_scores](int lhs, int rhs) { return cur_scores[lhs] > cur_scores[rhs]; }); keeps[j] = NonMaximaSuppression<T>(_boxes_in, inds, _info, j); } total_keep_count += keeps[j].size(); } - if(_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im()) + if (_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im()) { // merge all scores (represented by indices) together and sort auto get_all_scores_sorted = [&in_scores, &keeps, total_keep_count]() @@ -257,10 +271,10 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() std::vector<T> ret(total_keep_count); int ret_idx = 0; - for(unsigned int i = 1; i < keeps.size(); ++i) + for (unsigned int i = 1; i < keeps.size(); ++i) { auto &cur_keep = keeps[i]; - for(auto &ckv : cur_keep) + for (auto &ckv : cur_keep) { ret[ret_idx++] = in_scores[i][ckv]; } @@ -273,13 +287,13 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() auto all_scores_sorted = get_all_scores_sorted(); const T image_thresh = all_scores_sorted[all_scores_sorted.size() - _info.detections_per_im()]; - for(int j = 1; j < num_classes; ++j) + for (int j = 1; j < num_classes; ++j) { auto &cur_keep = keeps[j]; std::vector<int> new_keeps_j; - for(auto &k : cur_keep) + for (auto &k : cur_keep) { - if(in_scores[j][k] >= image_thresh) + if (in_scores[j][k] >= image_thresh) { new_keeps_j.push_back(k); } @@ -293,40 +307,52 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() // Write results int cur_out_idx = 0; - for(int j = 
j_start; j < num_classes; ++j) + for (int j = j_start; j < num_classes; ++j) { - auto &cur_keep = keeps[j]; - auto cur_out_scores = reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx))); - auto cur_out_classes = reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx))); - const int box_column = (cur_start_idx + cur_out_idx) * 4; - - for(unsigned int k = 0; k < cur_keep.size(); ++k) + auto &cur_keep = keeps[j]; + auto cur_out_scores = + reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx))); + auto cur_out_classes = + reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx))); + const int box_column = (cur_start_idx + cur_out_idx) * 4; + + for (unsigned int k = 0; k < cur_keep.size(); ++k) { - cur_out_scores[k] = in_scores[j][cur_keep[k]]; - cur_out_classes[k] = static_cast<T>(j); - auto cur_out_box_row0 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k))); - auto cur_out_box_row1 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k))); - auto cur_out_box_row2 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k))); - auto cur_out_box_row3 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k))); - *cur_out_box_row0 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k]))); - *cur_out_box_row1 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k]))); - *cur_out_box_row2 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k]))); - *cur_out_box_row3 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k]))); + cur_out_scores[k] = in_scores[j][cur_keep[k]]; + cur_out_classes[k] = static_cast<T>(j); + auto cur_out_box_row0 = + reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k))); + auto cur_out_box_row1 = + reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k))); + auto cur_out_box_row2 = + reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k))); + auto cur_out_box_row3 = + reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k))); + *cur_out_box_row0 = + *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k]))); + *cur_out_box_row1 = + *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k]))); + *cur_out_box_row2 = + *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k]))); + *cur_out_box_row3 = + *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k]))); } cur_out_idx += cur_keep.size(); } - if(_keeps != nullptr) + if (_keeps != nullptr) { cur_out_idx = 0; - for(int j = 0; j < num_classes; ++j) + for (int j = 0; j < num_classes; ++j) { - for(unsigned int i = 0; i < keeps[j].size(); ++i) + for (unsigned int i = 0; i < keeps[j].size(); ++i) { - *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) = static_cast<T>(keeps[j].at(i)); + *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) = + static_cast<T>(keeps[j].at(i)); } - *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) = keeps[j].size(); + *reinterpret_cast<uint32_t 
*>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) = + keeps[j].size(); cur_out_idx += keeps[j].size(); } } @@ -334,17 +360,25 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit() cur_start_idx += total_keep_count; } - if(_batch_splits_out != nullptr) + if (_batch_splits_out != nullptr) { - for(int b = 0; b < batch_size; ++b) + for (int b = 0; b < batch_size; ++b) { *reinterpret_cast<float *>(_batch_splits_out->ptr_to_element(Coordinates(b))) = total_keep_per_batch[b]; } } } -void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes, - ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info) +void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in, + const ITensor *boxes_in, + const ITensor *batch_splits_in, + ITensor *scores_out, + ITensor *boxes_out, + ITensor *classes, + ITensor *batch_splits_out, + ITensor *keeps, + ITensor *keeps_size, + const BoxNMSLimitInfo info) { ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::F16, DataType::F32); @@ -352,25 +386,28 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_ const unsigned int num_classes = scores_in->info()->dimension(0); ARM_COMPUTE_UNUSED(num_classes); - ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0), "First dimension of input boxes must be of size 4*num_classes"); - ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1), "Input scores and input boxes must have the same number of rows"); + ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0), + "First dimension of input boxes must be of size 4*num_classes"); + ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1), + "Input scores and input boxes must have the same number of rows"); ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != boxes_out->info()->dimension(1)); ARM_COMPUTE_ERROR_ON(boxes_out->info()->dimension(0) != 4); ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != classes->info()->dimension(0)); - if(keeps != nullptr) + if (keeps != nullptr) { - ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr, "keeps_size cannot be nullptr if keeps has to be provided as output"); + ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr, + "keeps_size cannot be nullptr if keeps has to be provided as output"); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, keeps); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keeps_size, 1, DataType::U32); ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != keeps->info()->dimension(0)); ARM_COMPUTE_ERROR_ON(num_classes != keeps_size->info()->dimension(0)); } - if(batch_splits_in != nullptr) + if (batch_splits_in != nullptr) { ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_in); } - if(batch_splits_out != nullptr) + if (batch_splits_out != nullptr) { ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_out); } @@ -399,7 +436,7 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run(const Window &window, const ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window); - switch(_scores_in->info()->data_type()) + switch (_scores_in->info()->data_type()) { case 
DataType::F32: run_nmslimit<float>(); diff --git a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp index c1187ff2b3..1224ec14a7 100644 --- a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp +++ b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp @@ -35,15 +35,22 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, unsigned int max_output_size, - const float score_threshold, const float iou_threshold) +Status validate_arguments(const ITensorInfo *bboxes, + const ITensorInfo *scores, + const ITensorInfo *output_indices, + unsigned int max_output_size, + const float score_threshold, + const float iou_threshold) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(bboxes, scores, output_indices); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bboxes, 1, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_indices, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes]."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "The scores tensor must be a 1-D float tensor of shape [num_boxes]."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1, "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, + "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes]."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, + "The scores tensor must be a 1-D float tensor of shape [num_boxes]."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1, + "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bboxes, scores); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->dimension(0) == 0, "Indices tensor must be bigger than 0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(max_output_size == 0, "Max size cannot be 0"); @@ -55,15 +62,26 @@ Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, } // namespace CPPNonMaximumSuppressionKernel::CPPNonMaximumSuppressionKernel() - : _input_bboxes(nullptr), _input_scores(nullptr), _output_indices(nullptr), _max_output_size(0), _score_threshold(0.f), _iou_threshold(0.f), _num_boxes(0) + : _input_bboxes(nullptr), + _input_scores(nullptr), + _output_indices(nullptr), + _max_output_size(0), + _score_threshold(0.f), + _iou_threshold(0.f), + _num_boxes(0) { } -void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices, - unsigned int max_output_size, const float score_threshold, const float iou_threshold) +void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, + const ITensor *input_scores, + ITensor *output_indices, + unsigned int max_output_size, + const float score_threshold, + const float iou_threshold) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_bboxes, input_scores, output_indices); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(), max_output_size, score_threshold, iou_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(), + max_output_size, score_threshold, iou_threshold)); 
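For readers following the two suppression kernels above, here is a minimal, self-contained sketch of the greedy IoU suppression they implement (box-corner format, highest score first); Box, iou and hard_nms are illustrative names, not library API. The Soft-NMS path in CPPBoxWithNonMaximaSuppressionLimitKernel differs only in that, instead of discarding an overlapping box outright, it rescales its score (LINEAR: weight = 1 - IoU when IoU exceeds the threshold; GAUSSIAN: weight = exp(-IoU^2 / sigma)) and drops the box once its score falls below soft_nms_min_score_thres.

#include <algorithm>
#include <numeric>
#include <vector>

struct Box
{
    float xmin, ymin, xmax, ymax;
};

// Intersection-over-union of two corner-format boxes; degenerate boxes count as zero overlap.
static float iou(const Box &a, const Box &b)
{
    const float inter_w = std::max(0.f, std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin));
    const float inter_h = std::max(0.f, std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin));
    const float area_a  = (a.xmax - a.xmin) * (a.ymax - a.ymin);
    const float area_b  = (b.xmax - b.xmin) * (b.ymax - b.ymin);
    if (area_a <= 0.f || area_b <= 0.f)
    {
        return 0.f;
    }
    const float inter = inter_w * inter_h;
    return inter / (area_a + area_b - inter);
}

// Greedy hard NMS: keep the highest-scoring box, suppress every remaining box
// whose IoU with it exceeds iou_thr, then repeat with the next survivor.
std::vector<int> hard_nms(const std::vector<Box> &boxes, const std::vector<float> &scores, float iou_thr)
{
    std::vector<int> order(boxes.size());
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(), [&](int l, int r) { return scores[l] > scores[r]; });

    std::vector<bool> suppressed(boxes.size(), false);
    std::vector<int>  keep;
    for (size_t i = 0; i < order.size(); ++i)
    {
        if (suppressed[order[i]])
        {
            continue;
        }
        keep.push_back(order[i]);
        for (size_t j = i + 1; j < order.size(); ++j)
        {
            if (!suppressed[order[j]] && iou(boxes[order[i]], boxes[order[j]]) > iou_thr)
            {
                suppressed[order[j]] = true;
            }
        }
    }
    return keep; // indices into boxes, ordered by descending score
}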
auto_init_if_empty(*output_indices->info(), TensorShape(max_output_size), 1, DataType::U8, QuantizationInfo()); @@ -82,10 +100,15 @@ void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, cons ICPPKernel::configure(win); } -Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, - unsigned int max_output_size, const float score_threshold, const float iou_threshold) +Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes, + const ITensorInfo *scores, + const ITensorInfo *output_indices, + unsigned int max_output_size, + const float score_threshold, + const float iou_threshold) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold)); return Status{}; } @@ -99,10 +122,10 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo // Auxiliary tensors std::vector<int> indices_above_thd; std::vector<float> scores_above_thd; - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { const float score_i = *(reinterpret_cast<float *>(_input_scores->ptr_to_element(Coordinates(i)))); - if(score_i >= _score_threshold) + if (score_i >= _score_threshold) { scores_above_thd.emplace_back(score_i); indices_above_thd.emplace_back(i); @@ -114,12 +137,9 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo std::vector<unsigned int> sorted_indices; sorted_indices.resize(num_above_thd); std::iota(sorted_indices.data(), sorted_indices.data() + num_above_thd, 0); - std::sort(std::begin(sorted_indices), - std::end(sorted_indices), + std::sort(std::begin(sorted_indices), std::end(sorted_indices), [&](unsigned int first, unsigned int second) - { - return scores_above_thd[first] > scores_above_thd[second]; - }); + { return scores_above_thd[first] > scores_above_thd[second]; }); // Number of output is the minimum between max_detection and the scores above the threshold const unsigned int num_output = std::min(_max_output_size, num_above_thd); @@ -127,19 +147,20 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo std::vector<bool> visited(num_above_thd, false); // Keep only boxes with small IoU - for(unsigned int i = 0; i < num_above_thd; ++i) + for (unsigned int i = 0; i < num_above_thd; ++i) { // Check if the output is full - if(output_idx >= num_output) + if (output_idx >= num_output) { break; } // Check if it was already visited, if not add it to the output and update the indices counter - if(!visited[sorted_indices[i]]) + if (!visited[sorted_indices[i]]) { - *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = indices_above_thd[sorted_indices[i]]; - visited[sorted_indices[i]] = true; + *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = + indices_above_thd[sorted_indices[i]]; + visited[sorted_indices[i]] = true; ++output_idx; } else @@ -148,28 +169,36 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo } // Once added one element at the output check if the next ones overlap and can be skipped - for(unsigned int j = i + 1; j < num_above_thd; ++j) + for (unsigned int j = i + 1; j < num_above_thd; ++j) { - if(!visited[sorted_indices[j]]) + if (!visited[sorted_indices[j]]) 
{ // Calculate IoU const unsigned int i_index = indices_above_thd[sorted_indices[i]]; const unsigned int j_index = indices_above_thd[sorted_indices[j]]; // Box-corner format: xmin, ymin, xmax, ymax - const auto box_i_xmin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, i_index)))); - const auto box_i_ymin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, i_index)))); - const auto box_i_xmax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, i_index)))); - const auto box_i_ymax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, i_index)))); - - const auto box_j_xmin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, j_index)))); - const auto box_j_ymin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, j_index)))); - const auto box_j_xmax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, j_index)))); - const auto box_j_ymax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, j_index)))); + const auto box_i_xmin = + *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, i_index)))); + const auto box_i_ymin = + *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, i_index)))); + const auto box_i_xmax = + *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, i_index)))); + const auto box_i_ymax = + *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, i_index)))); + + const auto box_j_xmin = + *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, j_index)))); + const auto box_j_ymin = + *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, j_index)))); + const auto box_j_xmax = + *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, j_index)))); + const auto box_j_ymax = + *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, j_index)))); const float area_i = (box_i_xmax - box_i_xmin) * (box_i_ymax - box_i_ymin); const float area_j = (box_j_xmax - box_j_xmin) * (box_j_ymax - box_j_ymin); float overlap; - if(area_i <= 0 || area_j <= 0) + if (area_i <= 0 || area_j <= 0) { overlap = 0.0f; } @@ -179,11 +208,12 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo const auto x_min_intersection = std::max<float>(box_i_xmin, box_j_xmin); const auto y_max_intersection = std::min<float>(box_i_ymax, box_j_ymax); const auto x_max_intersection = std::min<float>(box_i_xmax, box_j_xmax); - const auto area_intersection = std::max<float>(y_max_intersection - y_min_intersection, 0.0f) * std::max<float>(x_max_intersection - x_min_intersection, 0.0f); - overlap = area_intersection / (area_i + area_j - area_intersection); + const auto area_intersection = std::max<float>(y_max_intersection - y_min_intersection, 0.0f) * + std::max<float>(x_max_intersection - x_min_intersection, 0.0f); + overlap = area_intersection / (area_i + area_j - area_intersection); } - if(overlap > _iou_threshold) + if (overlap > _iou_threshold) { visited[sorted_indices[j]] = true; } @@ -192,7 +222,7 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo } // The output could be full but not the output indices tensor // Instead return values not valid we put -1 - for(; output_idx < _max_output_size; ++output_idx) + for (; output_idx < _max_output_size; ++output_idx) { *(reinterpret_cast<int 
*>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = -1; } diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp index 054c7bf05a..e68090d82b 100644 --- a/src/core/CPP/kernels/CPPPermuteKernel.cpp +++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm); // Validate configured output - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -65,7 +66,7 @@ void CPPPermuteKernel::run_permute(const Window &window) // Create output window Window window_out(window); const Window::Dimension zero_window = Window::Dimension(0, 0, 0); - for(size_t d = 0; d <= _perm.num_dimensions(); ++d) + for (size_t d = 0; d <= _perm.num_dimensions(); ++d) { window_out.set(d, zero_window); } @@ -74,28 +75,32 @@ void CPPPermuteKernel::run_permute(const Window &window) Iterator in(_input, window); Iterator out(_output, window_out); - if(_input->info()->num_dimensions() <= 3) + if (_input->info()->num_dimensions() <= 3) { - execute_window_loop(window, [&](const Coordinates & id) - { - const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2]; - *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr())); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2]; + *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr())); + }, + in, out); } - else if(_input->info()->num_dimensions() >= 4) + else if (_input->info()->num_dimensions() >= 4) { - execute_window_loop(window, [&](const Coordinates & id) - { - const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_strides[3]; - *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr())); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + + id[3] * perm_strides[3]; + *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr())); + }, + in, out); } } -CPPPermuteKernel::CPPPermuteKernel() - : _func(), _input(nullptr), _output(nullptr), _perm() +CPPPermuteKernel::CPPPermuteKernel() : _func(), _input(nullptr), _output(nullptr), _perm() { } @@ -113,7 +118,7 @@ void CPPPermuteKernel::configure(const ITensor *input, ITensor *output, const Pe _output = output; _perm = perm; - switch(input->info()->element_size()) + switch (input->info()->element_size()) { case 1: _func = &CPPPermuteKernel::run_permute<uint8_t>; @@ -152,7 +157,7 @@ void CPPPermuteKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - if(_func != nullptr) + if (_func != nullptr) { (this->*_func)(window); } diff --git a/src/core/CPP/kernels/CPPTopKVKernel.cpp 
b/src/core/CPP/kernels/CPPTopKVKernel.cpp index d2b54e412e..6ffb68e770 100644 --- a/src/core/CPP/kernels/CPPTopKVKernel.cpp +++ b/src/core/CPP/kernels/CPPTopKVKernel.cpp @@ -34,32 +34,34 @@ namespace arm_compute { namespace { -template <typename T, - typename std::enable_if<utils::traits::is_floating_point<T>::value, int>::type = 0> +template <typename T, typename std::enable_if<utils::traits::is_floating_point<T>::value, int>::type = 0> inline bool greater_than(T a, T b) { const T epsilon = std::numeric_limits<T>::epsilon(); return (a - b > epsilon); } -template < typename T, - typename std::enable_if < !utils::traits::is_floating_point<T>::value, int >::type = 0 > +template <typename T, typename std::enable_if<!utils::traits::is_floating_point<T>::value, int>::type = 0> inline bool greater_than(T a, T b) { return (a > b); } -Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k) +Status validate_arguments(const ITensorInfo *predictions, + const ITensorInfo *targets, + ITensorInfo *output, + const unsigned int k) { ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(targets, 1, DataType::U32); ARM_COMPUTE_RETURN_ERROR_ON(predictions->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(targets->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(targets->dimension(0) != predictions->dimension(1)); // Validate configured output - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), targets->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); @@ -72,22 +74,23 @@ Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *tar template <typename T> void CPPTopKVKernel::run_topkv() { - for(unsigned int i = 0; i < _batch_size; ++i) + for (unsigned int i = 0; i < _batch_size; ++i) { - const auto target_class_id = *reinterpret_cast<uint32_t *>(_targets->ptr_to_element(Coordinates{ i })); - const auto predicted_value = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ target_class_id, i })); + const auto target_class_id = *reinterpret_cast<uint32_t *>(_targets->ptr_to_element(Coordinates{i})); + const auto predicted_value = + *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{target_class_id, i})); // The variable rank indicates how many values there are before the target_class_id unsigned int rank = 0; - for(unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j) + for (unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j) { - const auto current_prediction = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ j, i })); - if(greater_than(current_prediction, predicted_value)) + const auto current_prediction = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{j, i})); + if (greater_than(current_prediction, predicted_value)) { rank++; } } - *(_output->ptr_to_element(Coordinates{ i })) = static_cast<uint8_t>(rank < _k); + *(_output->ptr_to_element(Coordinates{i})) = static_cast<uint8_t>(rank < _k); } } @@ -96,7 +99,10 @@ CPPTopKVKernel::CPPTopKVKernel() { } -void 
CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k) +void CPPTopKVKernel::configure(const ITensor *predictions, + const ITensor *targets, + ITensor *output, + const unsigned int k) { ARM_COMPUTE_ERROR_ON_NULLPTR(predictions, targets, output); @@ -115,7 +121,10 @@ void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *target ICPPKernel::configure(Window()); // Default 1 iteration window } -Status CPPTopKVKernel::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k) +Status CPPTopKVKernel::validate(const ITensorInfo *predictions, + const ITensorInfo *targets, + ITensorInfo *output, + const unsigned int k) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(predictions, targets, output, k)); return Status{}; @@ -129,7 +138,7 @@ bool CPPTopKVKernel::is_parallelisable() const void CPPTopKVKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(window, info); - switch(_predictions->info()->data_type()) + switch (_predictions->info()->data_type()) { case DataType::F32: run_topkv<float>(); diff --git a/src/core/CPP/kernels/CPPUpsampleKernel.cpp b/src/core/CPP/kernels/CPPUpsampleKernel.cpp index 7ef83fb2c4..b1efe32446 100644 --- a/src/core/CPP/kernels/CPPUpsampleKernel.cpp +++ b/src/core/CPP/kernels/CPPUpsampleKernel.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h" #include "arm_compute/core/Helpers.h" + #include "src/core/helpers/WindowHelpers.h" #include <cstddef> @@ -31,8 +32,7 @@ namespace arm_compute { -CPPUpsampleKernel::CPPUpsampleKernel() - : _input(nullptr), _output(nullptr), _info() +CPPUpsampleKernel::CPPUpsampleKernel() : _input(nullptr), _output(nullptr), _info() { } @@ -82,7 +82,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info) const size_t element_size = _input->info()->element_size(); // The fill value is normally 0, but for quantized types '0' corresponds to the offset - switch(_output->info()->data_type()) + switch (_output->info()->data_type()) { case DataType::QASYMM8: { @@ -102,7 +102,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info) // Create window Window window_out(window); - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { window_out.set(Window::DimX, Window::Dimension(start_width, end_width, stride_width)); window_out.set(Window::DimY, Window::Dimension(start_height, end_height, stride_height)); @@ -117,10 +117,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info) Iterator in(_input, window); Iterator out(_output, window_out); - execute_window_loop(window, [&](const Coordinates &) - { - memcpy(out.ptr(), in.ptr(), element_size); - }, - in, out); + execute_window_loop( + window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/Error.cpp b/src/core/Error.cpp index 5c8d45c987..679a93f9af 100644 --- a/src/core/Error.cpp +++ b/src/core/Error.cpp @@ -36,9 +36,10 @@ Status arm_compute::create_error(ErrorCode error_code, std::string msg) return Status(error_code, msg); } -Status arm_compute::create_error_msg(ErrorCode error_code, const char *func, const char *file, int line, const char *msg) +Status +arm_compute::create_error_msg(ErrorCode error_code, const char *func, const char *file, int line, const char *msg) { - std::array<char, 512> out{ 0 }; + std::array<char, 512> out{0}; snprintf(out.data(), out.size(), "in %s %s:%d: %s", func, file, line, msg); return Status(error_code, std::string(out.data())); } diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp index 292acf8633..2d1a13cb33 100644 --- a/src/core/GPUTarget.cpp +++ b/src/core/GPUTarget.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/core/GPUTarget.h" + #include "arm_compute/core/Log.h" #include <map> @@ -31,47 +32,47 @@ namespace { arm_compute::GPUTarget get_valhall_target(const std::string &version) { - if(version.find("G77") != std::string::npos) + if (version.find("G77") != std::string::npos) { return arm_compute::GPUTarget::G77; } - else if(version.find("G57") != std::string::npos) + else if (version.find("G57") != std::string::npos) { return arm_compute::GPUTarget::G57; } - if(version.find("G68") != std::string::npos) + if (version.find("G68") != std::string::npos) { return arm_compute::GPUTarget::G68; } - if(version.find("G78AE") != std::string::npos) + if (version.find("G78AE") != std::string::npos) { return arm_compute::GPUTarget::G78AE; } - if(version.find("G78") != std::string::npos) + if (version.find("G78") != std::string::npos) { return arm_compute::GPUTarget::G78; } - else if(version.find("G710") != std::string::npos) + else if (version.find("G710") != std::string::npos) { return arm_compute::GPUTarget::G710; } - else if(version.find("G610") != std::string::npos) + else if (version.find("G610") != std::string::npos) { return arm_compute::GPUTarget::G610; } - else if(version.find("G510") != std::string::npos) + else if (version.find("G510") != std::string::npos) { return arm_compute::GPUTarget::G510; } - else if(version.find("G310") != std::string::npos) + else if (version.find("G310") != std::string::npos) { return arm_compute::GPUTarget::G310; } - else if(version.find("G715") != std::string::npos) + else if (version.find("G715") != std::string::npos) { return arm_compute::GPUTarget::G715; } - else if(version.find("G615") != std::string::npos) + else if (version.find("G615") != std::string::npos) { return arm_compute::GPUTarget::G615; } @@ -83,39 +84,39 @@ arm_compute::GPUTarget get_valhall_target(const std::string &version) arm_compute::GPUTarget get_bifrost_target(const std::string &version) { - if(version.find("G71") != std::string::npos) + if (version.find("G71") != std::string::npos) { return arm_compute::GPUTarget::G71; } - else if(version.find("G72") != std::string::npos) + else if (version.find("G72") != std::string::npos) { return arm_compute::GPUTarget::G72; } - else if(version.find("G51BIG") != std::string::npos) + else if (version.find("G51BIG") != std::string::npos) { return arm_compute::GPUTarget::G51BIG; } - else if(version.find("G51LIT") != std::string::npos) + else if (version.find("G51LIT") != std::string::npos) { return arm_compute::GPUTarget::G51LIT; } - else if(version.find("G51") != std::string::npos) + else if 
(version.find("G51") != std::string::npos) { return arm_compute::GPUTarget::G51; } - else if(version.find("G52LIT") != std::string::npos) + else if (version.find("G52LIT") != std::string::npos) { return arm_compute::GPUTarget::G52LIT; } - else if(version.find("G52") != std::string::npos) + else if (version.find("G52") != std::string::npos) { return arm_compute::GPUTarget::G52; } - else if(version.find("G76") != std::string::npos) + else if (version.find("G76") != std::string::npos) { return arm_compute::GPUTarget::G76; } - else if(version.find("G31") != std::string::npos) + else if (version.find("G31") != std::string::npos) { return arm_compute::GPUTarget::G31; } @@ -127,15 +128,15 @@ arm_compute::GPUTarget get_bifrost_target(const std::string &version) arm_compute::GPUTarget get_midgard_target(const std::string &version) { - if(version.find("T600") != std::string::npos) + if (version.find("T600") != std::string::npos) { return arm_compute::GPUTarget::T600; } - else if(version.find("T700") != std::string::npos) + else if (version.find("T700") != std::string::npos) { return arm_compute::GPUTarget::T700; } - else if(version.find("T800") != std::string::npos) + else if (version.find("T800") != std::string::npos) { return arm_compute::GPUTarget::T800; } @@ -150,34 +151,16 @@ namespace arm_compute { const std::string &string_from_target(GPUTarget target) { - static std::map<GPUTarget, const std::string> gpu_target_map = - { - { GPUTarget::MIDGARD, "midgard" }, - { GPUTarget::BIFROST, "bifrost" }, - { GPUTarget::VALHALL, "valhall" }, - { GPUTarget::T600, "t600" }, - { GPUTarget::T700, "t700" }, - { GPUTarget::T800, "t800" }, - { GPUTarget::G71, "g71" }, - { GPUTarget::G72, "g72" }, - { GPUTarget::G51, "g51" }, - { GPUTarget::G51BIG, "g51big" }, - { GPUTarget::G51LIT, "g51lit" }, - { GPUTarget::G31, "g31" }, - { GPUTarget::G76, "g76" }, - { GPUTarget::G52, "g52" }, - { GPUTarget::G52LIT, "g52lit" }, - { GPUTarget::G77, "g77" }, - { GPUTarget::G57, "g57" }, - { GPUTarget::G78, "g78" }, - { GPUTarget::G68, "g68" }, - { GPUTarget::G78AE, "g78ae" }, - { GPUTarget::G710, "g710" }, - { GPUTarget::G610, "g610" }, - { GPUTarget::G510, "g510" }, - { GPUTarget::G310, "g310" }, - { GPUTarget::G715, "g715" }, - { GPUTarget::G615, "g615" }, + static std::map<GPUTarget, const std::string> gpu_target_map = { + {GPUTarget::MIDGARD, "midgard"}, {GPUTarget::BIFROST, "bifrost"}, {GPUTarget::VALHALL, "valhall"}, + {GPUTarget::T600, "t600"}, {GPUTarget::T700, "t700"}, {GPUTarget::T800, "t800"}, + {GPUTarget::G71, "g71"}, {GPUTarget::G72, "g72"}, {GPUTarget::G51, "g51"}, + {GPUTarget::G51BIG, "g51big"}, {GPUTarget::G51LIT, "g51lit"}, {GPUTarget::G31, "g31"}, + {GPUTarget::G76, "g76"}, {GPUTarget::G52, "g52"}, {GPUTarget::G52LIT, "g52lit"}, + {GPUTarget::G77, "g77"}, {GPUTarget::G57, "g57"}, {GPUTarget::G78, "g78"}, + {GPUTarget::G68, "g68"}, {GPUTarget::G78AE, "g78ae"}, {GPUTarget::G710, "g710"}, + {GPUTarget::G610, "g610"}, {GPUTarget::G510, "g510"}, {GPUTarget::G310, "g310"}, + {GPUTarget::G715, "g715"}, {GPUTarget::G615, "g615"}, }; return gpu_target_map[target]; @@ -189,7 +172,7 @@ GPUTarget get_target_from_name(const std::string &device_name) std::smatch name_parts; const bool found_mali = std::regex_search(device_name, name_parts, mali_regex); - if(!found_mali) + if (!found_mali) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Can't find valid Arm® Mali™ GPU. 
Target is set to default."); return GPUTarget::MIDGARD; @@ -203,22 +186,22 @@ GPUTarget get_target_from_name(const std::string &device_name) // Work-out gpu target GPUTarget gpu_target; - if(target == 'G' || is_future_gpu) + if (target == 'G' || is_future_gpu) { // Check for Valhall or Bifrost gpu_target = get_valhall_target(version); - if(gpu_target == GPUTarget::UNKNOWN) + if (gpu_target == GPUTarget::UNKNOWN) { gpu_target = get_bifrost_target(version); } // Default GPUTarget - if(gpu_target == GPUTarget::UNKNOWN) + if (gpu_target == GPUTarget::UNKNOWN) { gpu_target = GPUTarget::VALHALL; } } - else if(target == 'T') + else if (target == 'T') { gpu_target = get_midgard_target(version); } @@ -228,7 +211,7 @@ GPUTarget get_target_from_name(const std::string &device_name) } // Report in case of unknown target - if(gpu_target == GPUTarget::UNKNOWN) + if (gpu_target == GPUTarget::UNKNOWN) { ARM_COMPUTE_LOG_INFO_MSG_CORE("Arm® Mali™ Mali GPU unknown. Target is set to the default one. (BIFROST)"); return GPUTarget::BIFROST; diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp index 28e7f4c1e5..c801b097b5 100644 --- a/src/core/Helpers.cpp +++ b/src/core/Helpers.cpp @@ -25,8 +25,11 @@ namespace arm_compute { -ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape, - InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined) +ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, + const TensorShape &dst_shape, + InterpolationPolicy interpolate_policy, + SamplingPolicy sampling_policy, + bool border_undefined) { const DataLayout data_layout = src_info.data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -49,9 +52,9 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens auto valid_end_out_y = std::min<int>(std::ceil(valid_end_in_y * scale_y), dst_shape[idx_height]); // Handle valid points in case of the bi-linear interpolation - if(border_undefined) + if (border_undefined) { - switch(interpolate_policy) + switch (interpolate_policy) { case InterpolationPolicy::NEAREST_NEIGHBOR: { @@ -90,7 +93,7 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens } // Setup output valid region - ValidRegion valid_region{ Coordinates(), dst_shape, dst_shape.num_dimensions() }; + ValidRegion valid_region{Coordinates(), dst_shape, dst_shape.num_dimensions()}; valid_region.anchor.set(idx_width, std::max(0, valid_start_out_x)); valid_region.anchor.set(idx_height, std::max(0, valid_start_out_y)); @@ -109,14 +112,12 @@ const std::map<DataLayout, std::vector<DataLayoutDimension>> &get_layout_map() constexpr DataLayoutDimension D = DataLayoutDimension::DEPTH; constexpr DataLayoutDimension N = DataLayoutDimension::BATCHES; - static const std::map<DataLayout, std::vector<DataLayoutDimension>> layout_map = - { - { DataLayout::NDHWC, { C, W, H, D, N } }, - { DataLayout::NCDHW, { W, H, D, C, N } }, - { DataLayout::NHWC, { C, W, H, N } }, - { DataLayout::NCHW, { W, H, C, N } } - }; + static const std::map<DataLayout, std::vector<DataLayoutDimension>> layout_map = { + {DataLayout::NDHWC, {C, W, H, D, N}}, + {DataLayout::NCDHW, {W, H, D, C, N}}, + {DataLayout::NHWC, {C, W, H, N}}, + {DataLayout::NCHW, {W, H, C, N}}}; return layout_map; } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp index 832801255f..923c5f8a85 100644 --- a/src/core/IAccessWindow.cpp +++ b/src/core/IAccessWindow.cpp @@ -29,14 +29,18 @@ using namespace arm_compute; -ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, const ValidRegion &input_valid_region) const +ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, + const ValidRegion &input_valid_region) const { return compute_valid_region(window, input_valid_region, false, BorderSize(0)); } -ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const +ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, + ValidRegion input_valid_region, + bool border_undefined, + BorderSize border_size) const { - if(_info == nullptr) + if (_info == nullptr) { return input_valid_region; } @@ -45,7 +49,7 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va Coordinates old_anchor(anchor); TensorShape &shape = input_valid_region.shape; - if(!border_undefined) + if (!border_undefined) { border_size = BorderSize(0); } @@ -56,7 +60,7 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va // Additionally the valid region is shifted by the offset that is used by // the kernel to write back output values. anchor.set(0, std::max<int>(window.x().start() * _scale_x, anchor[0] + border_size.left) + _x); - if(_info->num_dimensions() > 1) + if (_info->num_dimensions() > 1) { anchor.set(1, std::max<int>(window.y().start() * _scale_y, anchor[1] + border_size.top) + _y); } @@ -69,15 +73,19 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va // old size is first converted into end points to compared against the // execution window. Afterwards the new end points are converted back into // a size of the region. 
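// As a worked instance of this end-point arithmetic (values assumed for
// illustration, not taken from this diff): a read window with _x = -1,
// _width = 3, _scale_x = 1 over a 16-element row, an execution window
// x = [0, 16) with step 1, and an undefined border of size 1 gives
//
//   anchor[0] = max(0 * 1, 0 + 1 /* border.left */) + (-1)           = 0
//   end_x     = min(0 + 16 - 1 /* border.right */, (16 - 1) * 1 + 3) = 15
//   shape[0]  = end_x - anchor[0]                                    = 15
//
// i.e. the last element of the row drops out of the valid region, because
// computing it would require reading into the undefined right-hand border.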
- shape.set(0, std::min<int>(old_anchor[0] + shape[0] - border_size.right, (window.x().end() - window.x().step()) * _scale_x + _width) - anchor[0]); - if(_info->num_dimensions() > 1) + shape.set(0, std::min<int>(old_anchor[0] + shape[0] - border_size.right, + (window.x().end() - window.x().step()) * _scale_x + _width) - + anchor[0]); + if (_info->num_dimensions() > 1) { - shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom, (window.y().end() - window.y().step()) * _scale_y + _height) - anchor[1]); + shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom, + (window.y().end() - window.y().step()) * _scale_y + _height) - + anchor[1]); } // For higher dimensions use the intersection of the window size and the // valid region of the input - for(size_t d = 2; d < _info->num_dimensions(); ++d) + for (size_t d = 2; d < _info->num_dimensions(); ++d) { anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d])); shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]); @@ -86,9 +94,12 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va return input_valid_region; } -void AccessWindowRectangle::set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined, const BorderSize &border_size) +void AccessWindowRectangle::set_valid_region(const Window &window, + const ValidRegion &input_valid_region, + bool border_undefined, + const BorderSize &border_size) { - if(_info != nullptr) + if (_info != nullptr) { _info->set_valid_region(compute_valid_region(window, input_valid_region, border_undefined, border_size)); } @@ -97,17 +108,16 @@ void AccessWindowRectangle::set_valid_region(const Window &window, const ValidRe bool AccessWindowRectangle::update_window_if_needed(Window &window) const { // Only update the window size if we can't use padding - if(_info == nullptr || _info->is_resizable()) + if (_info == nullptr || _info->is_resizable()) { return false; } - PaddingSize needed = get_needed_padding(window); + PaddingSize needed = get_needed_padding(window); PaddingSize available = _info->padding(); - if(needed.top <= available.top && needed.right <= available.right - && needed.bottom <= available.bottom - && needed.left <= available.left) + if (needed.top <= available.top && needed.right <= available.right && needed.bottom <= available.bottom && + needed.left <= available.left) { return false; } @@ -124,12 +134,12 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const const int max_y = (window.y().end() - window.y().step()) * _scale_y + _y + _height; // Adjust window start for Y dimension - if(min_y < 0) + if (min_y < 0) { // Calculate rows available above the tensor const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]); - if(min_y < front_pad_y_available) + if (min_y < front_pad_y_available) { // Not enough padding available, need to shrink the window int start = adjust_up(min_y, front_pad_y_available, window.y().step() * _scale_y) - _y; @@ -144,18 +154,19 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const } // Adjust window end for Y dimension - if(max_y > static_cast<int>(shape[1])) + if (max_y > static_cast<int>(shape[1])) { const int stride_z = _info->num_dimensions() > 2 ? 
strides[2] : _info->total_size(); // Calculate rows available below the tensor const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y; - if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y) + if (static_cast<int>(shape[1]) + tail_pad_y_available < max_y) { // Not enough padding available, need to shrink the window - int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + window.y().step() * _scale_y - _y - _height; - end = std::max<int>(window.y().start(), end / _scale_y); + int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + + window.y().step() * _scale_y - _y - _height; + end = std::max<int>(window.y().start(), end / _scale_y); window.set(1, Window::Dimension(window.y().start(), end, window.y().step())); window_modified = true; @@ -170,11 +181,14 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size(); // Adjust window start for X dimension - if(min_x < 0) + if (min_x < 0) { - const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]); + const int front_pad_x_available = + -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], + stride_y - shape[0] * strides[0]) / + static_cast<int>(strides[0]); - if(min_x < front_pad_x_available) + if (min_x < front_pad_x_available) { // Not enough padding available, need to shrink the window int start = adjust_up(min_x, front_pad_x_available, window.x().step() * _scale_x) - _x; @@ -189,15 +203,16 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const } // Adjust window end for X dimension - if(max_x > static_cast<int>(shape[0])) + if (max_x > static_cast<int>(shape[0])) { const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x; - if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x) + if (static_cast<int>(shape[0]) + tail_pad_x_available < max_x) { // Not enough padding available, need to shrink the window - int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + window.x().step() * _scale_x - _x - _width; - end = std::max<int>(window.x().start(), end / _scale_x); + int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + + window.x().step() * _scale_x - _x - _width; + end = std::max<int>(window.x().start(), end / _scale_x); window.set(0, Window::Dimension(window.x().start(), end, window.x().step())); window_modified = true; @@ -212,15 +227,15 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const bool AccessWindowRectangle::update_padding_if_needed(const Window &window) { // Only update the padding if the tensor allows it - if(_info == nullptr || !_info->is_resizable()) + if (_info == nullptr || !_info->is_resizable()) { return false; } // Update strides in tensor info - return _info->extend_padding( get_needed_padding(window)); + return _info->extend_padding(get_needed_padding(window)); } -PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window)const +PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window) const { ARM_COMPUTE_ERROR_ON(_scale_x == 0); ARM_COMPUTE_ERROR_ON(_scale_y == 0); diff --git a/src/core/IKernel.cpp b/src/core/IKernel.cpp index 31f1ec7a3f..fb7e095091 100644 --- 
a/src/core/IKernel.cpp +++ b/src/core/IKernel.cpp @@ -30,8 +30,7 @@ const Window &IKernel::window() const return _window; } -IKernel::IKernel() - : _window() +IKernel::IKernel() : _window() { // Create an empty window to make sure the children classes set the window values themselves _window.set(Window::DimX, Window::Dimension(0, 0, 1)); diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp index 2f4354cc6f..4dc8ea959b 100644 --- a/src/core/ITensor.cpp +++ b/src/core/ITensor.cpp @@ -35,7 +35,7 @@ namespace arm_compute { void ITensor::copy_from(const ITensor &src) { - if(&src == this) + if (&src == this) { return; } @@ -47,7 +47,7 @@ void ITensor::copy_from(const ITensor &src) ARM_COMPUTE_ERROR_ON(src_info->num_channels() != dst_info->num_channels()); ARM_COMPUTE_ERROR_ON(src_info->element_size() != dst_info->element_size()); - for(size_t d = 0; d < src_info->num_dimensions(); d++) + for (size_t d = 0; d < src_info->num_dimensions(); d++) { ARM_COMPUTE_ERROR_ON(src_info->dimension(d) > dst_info->dimension(d)); } @@ -66,11 +66,7 @@ void ITensor::copy_from(const ITensor &src) const size_t line_size = src_info->element_size() * src_info->dimension(0); execute_window_loop( - win_src, [&](const Coordinates &) - { - memcpy(dst_it.ptr(), src_it.ptr(), line_size); - }, - src_it, dst_it); + win_src, [&](const Coordinates &) { memcpy(dst_it.ptr(), src_it.ptr(), line_size); }, src_it, dst_it); } #ifdef ARM_COMPUTE_ASSERTS_ENABLED @@ -87,10 +83,10 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const stream_status.copyfmt(s); // Set precision - if(is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default)) + if (is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default)) { int precision = io_fmt.precision; - if(io_fmt.precision_type == IOFormatInfo::PrecisionType::Full) + if (io_fmt.precision_type == IOFormatInfo::PrecisionType::Full) { precision = std::numeric_limits<float>().max_digits10; } @@ -101,7 +97,7 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const size_t print_width = 0; size_t print_height = 0; int start_offset = 0; - switch(io_fmt.print_region) + switch (io_fmt.print_region) { case IOFormatInfo::PrintRegion::NoPadding: print_width = this->info()->dimension(0); @@ -111,13 +107,14 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const case IOFormatInfo::PrintRegion::ValidRegion: print_width = this->info()->valid_region().shape.x(); print_height = this->info()->valid_region().shape.y(); - start_offset = this->info()->offset_element_in_bytes(Coordinates(this->info()->valid_region().anchor.x(), - this->info()->valid_region().anchor.y())); + start_offset = this->info()->offset_element_in_bytes( + Coordinates(this->info()->valid_region().anchor.x(), this->info()->valid_region().anchor.y())); break; case IOFormatInfo::PrintRegion::Full: print_width = padding.left + this->info()->dimension(0) + padding.right; print_height = padding.top + this->info()->dimension(1) + padding.bottom; - start_offset = static_cast<int>(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] - padding.left * strides[0]; + start_offset = static_cast<int>(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] - + padding.left * strides[0]; break; default: break; @@ -129,16 +126,17 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const const uint8_t *ptr = this->buffer() + start_offset; // Start printing - for(size_t i = 0; i < slices2D; ++i) + for (size_t i = 0; i < 
slices2D; ++i) { // Find max_width of elements in slice to align columns int max_element_width = 0; - if(io_fmt.align_columns) + if (io_fmt.align_columns) { size_t offset = i * strides[2]; - for(size_t h = 0; h < print_height; ++h) + for (size_t h = 0; h < print_height; ++h) { - max_element_width = std::max<int>(max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width)); + max_element_width = std::max<int>( + max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width)); offset += strides[1]; } } @@ -146,7 +144,7 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const // Print slice { size_t offset = i * strides[2]; - for(size_t h = 0; h < print_height; ++h) + for (size_t h = 0; h < print_height; ++h) { print_consecutive_elements(s, dt, ptr + offset, print_width, max_element_width, io_fmt.element_delim); offset += strides[1]; diff --git a/src/core/ITensorPack.cpp b/src/core/ITensorPack.cpp index 90f9a45039..0f8b0824f8 100644 --- a/src/core/ITensorPack.cpp +++ b/src/core/ITensorPack.cpp @@ -27,10 +27,9 @@ namespace arm_compute { -ITensorPack::ITensorPack(std::initializer_list<PackElement> l) - : _pack() +ITensorPack::ITensorPack(std::initializer_list<PackElement> l) : _pack() { - for(auto &e : l) + for (auto &e : l) { _pack[e.id] = e; } @@ -54,7 +53,7 @@ void ITensorPack::add_const_tensor(int id, const ITensor *tensor) const ITensor *ITensorPack::get_const_tensor(int id) const { auto it = _pack.find(id); - if(it != _pack.end()) + if (it != _pack.end()) { return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor; } @@ -81,4 +80,4 @@ bool ITensorPack::empty() const { return _pack.empty(); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h index e6d0e532c8..5f4d08d0f6 100644 --- a/src/core/NEON/NEAsymm.h +++ b/src/core/NEON/NEAsymm.h @@ -26,6 +26,7 @@ #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include <arm_neon.h> namespace arm_compute @@ -90,7 +91,7 @@ inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, { const static int32x4_t zero_s32 = vdupq_n_s32(0); - if(result_shift < 0) + if (result_shift < 0) { in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift))); in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift))); @@ -130,18 +131,13 @@ inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to U8 uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_u8 = vmaxq_u8(out_u8, min_u8); out_u8 = vminq_u8(out_u8, max_u8); @@ -170,7 +166,7 @@ inline int8x16_t finalize_quantization(int32x4x4_t &in_s32, int8x16_t max_s8, bool is_bounded_relu) { - if(result_shift < 0) + if (result_shift < 0) { in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift))); in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift))); @@ -204,18 +200,13 @@ inline int8x16_t finalize_quantization(int32x4x4_t &in_s32, in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to S8 int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_s8 = vmaxq_s8(out_s8, min_s8); out_s8 = vminq_s8(out_s8, max_s8); @@ -247,8 +238,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, const static int32x4_t one_s32 = vdupq_n_s32(1); // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar - int32x4x4_t res_shift_gt0 = - { + int32x4x4_t res_shift_gt0 = { vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]), vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]), vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]), @@ -260,8 +250,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, res_shift_gt0.val[2] = rounding_divide_by_pow2(res_shift_gt0.val[2], result_shift.val[2]); res_shift_gt0.val[3] = rounding_divide_by_pow2(res_shift_gt0.val[3], result_shift.val[3]); - int32x4x4_t res_shift_lt0 = - { + int32x4x4_t res_shift_lt0 = { vmulq_s32(in_s32.val[0], vshlq_s32(one_s32, vnegq_s32(result_shift.val[0]))), vmulq_s32(in_s32.val[1], vshlq_s32(one_s32, vnegq_s32(result_shift.val[1]))), vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))), 
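The two initializer lists above are the core of finalize_quantization_symm: for a non-negative per-lane shift, each lane is multiplied by its Q31 fixed-point multiplier via vqrdmulhq_s32 and then rounding-divided by 2^shift, while for a negative shift the lane is pre-scaled by 2^(-shift) before the doubling multiply; the mask_lt0 select below picks the right path per lane. A scalar sketch of the same arithmetic, for illustration only — requantize_lane is a hypothetical name, and saturation plus the negative-tie fixup of the real helpers are simplified here:

    #include <cstdint>

    // One lane of the symmetric requantization shown in the hunks above.
    int32_t requantize_lane(int32_t v, int32_t multiplier, int32_t shift)
    {
        if (shift < 0)
        {
            v *= 1 << (-shift); // pre-scale, mirrors the vmulq_s32/vshlq_s32 path
            shift = 0;
        }
        // vqrdmulhq_s32: rounded high half of the doubled 64-bit product
        const int64_t prod = 2 * static_cast<int64_t>(v) * static_cast<int64_t>(multiplier);
        int32_t res = static_cast<int32_t>((prod + (INT64_C(1) << 31)) >> 32);
        if (shift > 0)
        {
            // rounding divide by 2^shift (tie handling simplified vs. rounding_divide_by_pow2)
            res = (res + (1 << (shift - 1))) >> shift;
        }
        return res;
    }

Keeping both shift paths branch-free in the vector code and blending with vbslq_s32 lets a single pass handle per-channel shifts of mixed sign.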
@@ -273,8 +262,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]); // Select result depending on shift value - const uint32x4x4_t mask_lt0 = - { + const uint32x4x4_t mask_lt0 = { #ifdef __aarch64__ vcltzq_s32(result_shift.val[0]), vcltzq_s32(result_shift.val[1]), @@ -300,18 +288,13 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to S8 int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_s8 = vmaxq_s8(out_s8, min_s8); out_s8 = vminq_s8(out_s8, max_s8); @@ -332,15 +315,20 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, * * @return Quantized value */ -inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, - int32_t result_shift, int32_t result_offset_after_shift_s32, - uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu) +inline uint8_t finalize_quantization(int32_t in_value, + int result_fixedpoint_multiplier, + int32_t result_shift, + int32_t result_offset_after_shift_s32, + uint8_t min_u8, + uint8_t max_u8, + bool is_bounded_relu) { int32x4_t in_s32 = vdupq_n_s32(in_value); - if(result_shift < 0) + if (result_shift < 0) { - in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); + in_value = vgetq_lane_s32( + vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); } else { @@ -355,7 +343,7 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul // Bound the result uint8_t out_u8 = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value))); - if(is_bounded_relu) + if (is_bounded_relu) { out_u8 = static_cast<uint8_t>(std::max(min_u8, std::min(max_u8, out_u8))); } @@ -375,15 +363,20 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul * * @return Quantized value */ -inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier, - int32_t result_shift, int32_t result_offset_after_shift_s32, - int8_t min_s8, int8_t max_s8, bool is_bounded_relu) +inline int8_t finalize_quantization(int32_t in_value, + int result_fixedpoint_multiplier, + int32_t result_shift, + int32_t result_offset_after_shift_s32, + int8_t min_s8, + int8_t max_s8, + bool is_bounded_relu) { int32x4_t in_s32 = vdupq_n_s32(in_value); - if(result_shift < 0) + if (result_shift < 0) { - in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); + in_value = vgetq_lane_s32( + vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0); } else { @@ -399,7 +392,7 @@ inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mult // Bound the result int8_t out_s8 = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value))); - if(is_bounded_relu) + if (is_bounded_relu) { 
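                // Note (added for illustration; not in the original source): with a
                // fused bounded ReLU the already-saturated int8 result is clamped once
                // more to the activation range, i.e. out_s8 = min(max_s8, max(min_s8, out_s8)).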
out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8))); } @@ -416,17 +409,16 @@ inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mult */ inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x2_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale), - } - }; + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x2_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), + vscale), + }}; return vdequantized_input; } @@ -439,17 +431,14 @@ inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationI */ inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x2_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale), - } - }; + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x2_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale), + }}; return vdequantized_input; } @@ -462,19 +451,24 @@ inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationIn */ inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - } - }; + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), + vscale), + 
vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), + vscale), + }}; return vdequantized_input; } @@ -487,19 +481,16 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantization */ inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const int offset = qi.offset; - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - } - }; + const float scale = qi.scale; + const int offset = qi.offset; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + }}; return vdequantized_input; } @@ -513,17 +504,22 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationI */ inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset) { - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale), - } - }; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), + vscale), + }}; 
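    // Per lane the expression above computes (descriptive note added here; it is
    // not part of the original source):
    //   dst = static_cast<float>(static_cast<int32_t>(q) - offset) * scale;
    // the nested vmovl_*/vget_* calls only widen u8 -> u16 -> s32 so that the
    // offset subtraction and the float conversion can run on 32-bit lanes.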
return vdequantized_input; } @@ -537,17 +533,14 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offs */ inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offset) { - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), - } - }; + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale), + }}; return vdequantized_input; } @@ -560,15 +553,12 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offse */ inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale) { - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]), - } - }; + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]), + }}; return vdequantized_input; } @@ -581,16 +571,13 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale */ inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale) { - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale), - } - }; + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale), + 
vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale), + }}; return vdequantized_input; } @@ -607,18 +594,15 @@ inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInf const int offset = qi.offset; const float32x4_t voffset = vdupq_n_f32(offset); const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { + const int32x4x4_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), #else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), #endif //__aarch64__ - } - }; + }}; return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); } @@ -635,18 +619,15 @@ inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizat const int offset = qi.offset; const float32x4_t voffset = vdupq_n_f32(offset); const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { + const int32x4x4_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), #else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), #endif //__aarch64__ - } - }; + }}; return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); } @@ -654,22 +635,19 @@ inline int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, int3 { const int32x4_t voffset = vdupq_n_s32(offset); const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { + const int32x4x4_t rf = {{ #ifdef __aarch64__ - vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset), - vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset), - vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset), - vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset), #else //__aarch64__ - vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset), - vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset), - vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset), - vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset), + vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset), + vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset), + vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset), + vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset), #endif //__aarch64__ - } - }; + }}; return rf; } @@ -715,7 +693,7 @@ inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, 
const UniformQua auto rf = vquantize_internal(qv, qi.scale, qi.offset); const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1])); const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3])); - return { pa, pb }; + return {pa, pb}; } } // namespace arm_compute diff --git a/src/core/NEON/NEAsymm.inl b/src/core/NEON/NEAsymm.inl index ca2aea1e18..fd62fd4654 100644 --- a/src/core/NEON/NEAsymm.inl +++ b/src/core/NEON/NEAsymm.inl @@ -51,14 +51,14 @@ inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t v D_f32x4 = vmlaq_f32(vo, D_f32x4, vs); // Convert float32 vectors to uint32 vectors #if __aarch64__ - if(round_policy == RoundingPolicy::TO_NEAREST_EVEN) + if (round_policy == RoundingPolicy::TO_NEAREST_EVEN) { A_u32x4 = vcvtnq_u32_f32(A_f32x4); B_u32x4 = vcvtnq_u32_f32(B_f32x4); C_u32x4 = vcvtnq_u32_f32(C_f32x4); D_u32x4 = vcvtnq_u32_f32(D_f32x4); } - else if(round_policy == RoundingPolicy::TO_NEAREST_UP) + else if (round_policy == RoundingPolicy::TO_NEAREST_UP) { A_u32x4 = vcvtaq_u32_f32(A_f32x4); B_u32x4 = vcvtaq_u32_f32(B_f32x4); @@ -86,7 +86,7 @@ inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t v return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8)); } -template <RoundingPolicy round_policy> +template <RoundingPolicy round_policy> inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo) { // Convert uint8 vectors to int16 vectors @@ -110,14 +110,14 @@ inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x C_f32x4 = vmlaq_f32(vo, C_f32x4, vs); D_f32x4 = vmlaq_f32(vo, D_f32x4, vs); #if __aarch64__ - if(round_policy == RoundingPolicy::TO_NEAREST_EVEN) + if (round_policy == RoundingPolicy::TO_NEAREST_EVEN) { A_s32x4 = vcvtnq_s32_f32(A_f32x4); B_s32x4 = vcvtnq_s32_f32(B_f32x4); C_s32x4 = vcvtnq_s32_f32(C_f32x4); D_s32x4 = vcvtnq_s32_f32(D_f32x4); } - else if(round_policy == RoundingPolicy::TO_NEAREST_UP) + else if (round_policy == RoundingPolicy::TO_NEAREST_UP) { A_s32x4 = vcvtaq_s32_f32(A_f32x4); B_s32x4 = vcvtaq_s32_f32(B_f32x4); diff --git a/src/core/NEON/NEFixedPoint.inl b/src/core/NEON/NEFixedPoint.inl index 8bff9c4a8e..fb403b6d26 100644 --- a/src/core/NEON/NEFixedPoint.inl +++ b/src/core/NEON/NEFixedPoint.inl @@ -30,13 +30,7 @@ namespace arm_compute inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b) { - float32x4x2_t res = - { - { - vmaxq_f32(a.val[0], b.val[0]), - vmaxq_f32(a.val[1], b.val[1]) - } - }; + float32x4x2_t res = {{vmaxq_f32(a.val[0], b.val[0]), vmaxq_f32(a.val[1], b.val[1])}}; return res; } #endif /* DOXYGEN_SKIP_THIS */ diff --git a/src/core/NEON/NEMath.inl b/src/core/NEON/NEMath.inl index 1cbe669373..f875917988 100644 --- a/src/core/NEON/NEMath.inl +++ b/src/core/NEON/NEMath.inl @@ -29,19 +29,16 @@ namespace arm_compute { /** Logarithm polynomial coefficients */ -const std::array<float32x4_t, 8> log_tab = -{ - { - vdupq_n_f32(-2.29561495781f), - vdupq_n_f32(-2.47071170807f), - vdupq_n_f32(-5.68692588806f), - vdupq_n_f32(-0.165253549814f), - vdupq_n_f32(5.17591238022f), - vdupq_n_f32(0.844007015228f), - vdupq_n_f32(4.58445882797f), - vdupq_n_f32(0.0141278216615f), - } -}; +const std::array<float32x4_t, 8> log_tab = {{ + vdupq_n_f32(-2.29561495781f), + vdupq_n_f32(-2.47071170807f), + vdupq_n_f32(-5.68692588806f), + vdupq_n_f32(-0.165253549814f), + vdupq_n_f32(5.17591238022f), + vdupq_n_f32(0.844007015228f), + vdupq_n_f32(4.58445882797f), + vdupq_n_f32(0.0141278216615f), 
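    // (Comment added for clarity; not in the original source.) These eight
    // constants are the coefficients of the degree-7 polynomial that
    // vtaylor_polyq_f32() evaluates to approximate the logarithm on the
    // range-reduced mantissa.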
+}}; /** Sin polynomial coefficients */ constexpr float te_sin_coeff2 = 0.166666666666f; // 1/(2*3) @@ -54,7 +51,7 @@ inline float32x4_t prefer_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { #if __ARM_FEATURE_FMA return vfmaq_f32(a, b, c); -#else // __ARM_FEATURE_FMA +#else // __ARM_FEATURE_FMA return vmlaq_f32(a, b, c); #endif // __ARM_FEATURE_FMA } @@ -73,13 +70,14 @@ inline float32x4_t vroundq_rte_f32(float32x4_t val) { #ifdef __aarch64__ return vrndnq_f32(val); -#else // __aarch64__ +#else // __aarch64__ static const float32x4_t CONST_HALF_FLOAT = vdupq_n_f32(0.5f); static const float32x4_t CONST_1_FLOAT = vdupq_n_f32(1.f); static const int32x4_t CONST_1_INT = vdupq_n_s32(1); const float32x4_t floor_val = vfloorq_f32(val); const float32x4_t diff = vsubq_f32(val, floor_val); - const float32x4_t fp32_upper_limit = vreinterpretq_f32_u32(vdupq_n_u32(0x4B000000)); // 0x4B000000 = (23U + 127U) << 23U + const float32x4_t fp32_upper_limit = + vreinterpretq_f32_u32(vdupq_n_u32(0x4B000000)); // 0x4B000000 = (23U + 127U) << 23U /* * 1. Select the floor value when (diff<0.5 || (diff==0.5 && floor_val%2==0). @@ -95,12 +93,13 @@ inline float32x4_t vroundq_rte_f32(float32x4_t val) * Threshold upper limit with format |S|E(8bits)| Fraction(23bits) | = (23 + 127) << 23 (assuming positive sign): Adding 127, because 127 represents the actual zero in this format. */ - float32x4_t rounded_val = vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT), - vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT), - vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT)))), - floor_val, vaddq_f32(floor_val, CONST_1_FLOAT)); + float32x4_t rounded_val = vbslq_f32( + vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT), + vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT), + vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))), + floor_val, vaddq_f32(floor_val, CONST_1_FLOAT)); - float32x4_t result = vbslq_f32(vcgeq_f32(vabsq_f32(val), fp32_upper_limit), val, rounded_val); + float32x4_t result = vbslq_f32(vcgeq_f32(vabsq_f32(val), fp32_upper_limit), val, rounded_val); return result; #endif // __aarch64__ @@ -118,8 +117,8 @@ inline float32x2_t vinvsqrt_f32(float32x2_t x) inline float32x4_t vinvsqrtq_f32(float32x4_t x) { float32x4_t sqrt_reciprocal = vrsqrteq_f32(x); - sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); return sqrt_reciprocal; } @@ -152,8 +151,7 @@ inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t return res; } -static const uint32_t exp_f32_coeff[] = -{ +static const uint32_t exp_f32_coeff[] = { 0x3f7ffff6, // x^1: 0x1.ffffecp-1f 0x3efffedb, // x^2: 0x1.fffdb6p-2f 0x3e2aaf33, // x^3: 0x1.555e66p-3f @@ -169,10 +167,12 @@ inline float32x4_t vexpq_f32(float32x4_t x) const auto c4 = vreinterpretq_f32_u32(vdupq_n_u32(exp_f32_coeff[3])); const auto c5 = vreinterpretq_f32_u32(vdupq_n_u32(exp_f32_coeff[4])); - const auto shift = vreinterpretq_f32_u32(vdupq_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f - const auto inv_ln2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f - const auto neg_ln2_hi = 
vreinterpretq_f32_u32(vdupq_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f - const auto neg_ln2_lo = vreinterpretq_f32_u32(vdupq_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f + const auto shift = vreinterpretq_f32_u32(vdupq_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f + const auto inv_ln2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f + const auto neg_ln2_hi = + vreinterpretq_f32_u32(vdupq_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f + const auto neg_ln2_lo = + vreinterpretq_f32_u32(vdupq_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f const auto inf = vdupq_n_f32(std::numeric_limits<float>::infinity()); const auto max_input = vdupq_n_f32(88.37f); // Approximately ln(2^127.5) @@ -224,9 +224,9 @@ inline float32x4_t vexpq_f32(float32x4_t x) #ifdef __aarch64__ inline float32x4_t verfq_f32(float32x4_t x) { - static const float erffdata[4] = { 0.278393f, 0.230389f, 0.000972f, 0.078108f }; + static const float erffdata[4] = {0.278393f, 0.230389f, 0.000972f, 0.078108f}; static const float32x4_t coeffdata = vld1q_f32(erffdata); - static const float32x4_t onev{ vdupq_n_f32(1.0f) }; + static const float32x4_t onev{vdupq_n_f32(1.0f)}; uint32x4_t selector = vcltzq_f32(x); @@ -287,10 +287,12 @@ inline float32x4_t vtanhq_f32(float32x4_t val) float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH); // x * (1 - x^2/3) if |x| < 5.e-3 or (exp2x - 1) / (exp2x + 1) otherwise - float32x4_t exp2x = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vexpq_f32(vmulq_f32(CONST_2, x)), vmulq_f32(x, x)); - float32x4_t num = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vsubq_f32(exp2x, CONST_1), vmulq_f32(CONST_1_3, exp2x)); - float32x4_t den = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vaddq_f32(exp2x, CONST_1), vsubq_f32(CONST_1, num)); - float32x4_t tanh = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vmulq_f32(num, vinvq_f32(den)), vmulq_f32(x, den)); + float32x4_t exp2x = + vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vexpq_f32(vmulq_f32(CONST_2, x)), vmulq_f32(x, x)); + float32x4_t num = + vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vsubq_f32(exp2x, CONST_1), vmulq_f32(CONST_1_3, exp2x)); + float32x4_t den = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vaddq_f32(exp2x, CONST_1), vsubq_f32(CONST_1, num)); + float32x4_t tanh = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vmulq_f32(num, vinvq_f32(den)), vmulq_f32(x, den)); return tanh; } @@ -456,30 +458,23 @@ inline float32x4x4_t convert_to_float32x4x4(const int8x16_t &in) inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out) { - out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])), - vqmovn_u32(vcvtq_u32_f32(in2.val[0])))); - out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])), - vqmovn_u32(vcvtq_u32_f32(in2.val[1])))); - out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])), - vqmovn_u32(vcvtq_u32_f32(in2.val[2])))); + out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])), vqmovn_u32(vcvtq_u32_f32(in2.val[0])))); + out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])), vqmovn_u32(vcvtq_u32_f32(in2.val[1])))); + out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])), vqmovn_u32(vcvtq_u32_f32(in2.val[2])))); } inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out) { - const auto low = 
vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])), - vqmovn_u32(vcvtq_u32_f32(in.val[1]))); - const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])), - vqmovn_u32(vcvtq_u32_f32(in.val[3]))); - out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); + const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])), vqmovn_u32(vcvtq_u32_f32(in.val[1]))); + const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])), vqmovn_u32(vcvtq_u32_f32(in.val[3]))); + out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); } inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out) { - const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), - vqmovn_s32(vcvtq_s32_f32(in.val[1]))); - const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), - vqmovn_s32(vcvtq_s32_f32(in.val[3]))); - out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); + const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), vqmovn_s32(vcvtq_s32_f32(in.val[1]))); + const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), vqmovn_s32(vcvtq_s32_f32(in.val[3]))); + out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); } template <> @@ -552,8 +547,8 @@ inline float16x4_t vinvsqrt_f16(float16x4_t x) inline float16x8_t vinvsqrtq_f16(float16x8_t x) { float16x8_t sqrt_reciprocal = vrsqrteq_f16(x); - sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); - sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); return sqrt_reciprocal; } @@ -602,8 +597,8 @@ inline float16x4_t vtanh_rational_approx_f16(float16x4_t x16) inline float16x8_t vtanhq_f16(float16x8_t x) { // Split into high/low and use rational approximation on both parts exactly - const float16x8_t tanh = vcombine_f16(vtanh_rational_approx_f16(vget_low_f16(x)), - vtanh_rational_approx_f16(vget_high_f16(x))); + const float16x8_t tanh = + vcombine_f16(vtanh_rational_approx_f16(vget_low_f16(x)), vtanh_rational_approx_f16(vget_high_f16(x))); // tanh(x) == sign(x) to F16 precision for |x| >= 4.508, use sign after this const float16x8_t ONE = vdupq_n_f16(1.0f); diff --git a/src/core/NEON/NESymm.h b/src/core/NEON/NESymm.h index e6644577a1..ec246efc8c 100644 --- a/src/core/NEON/NESymm.h +++ b/src/core/NEON/NESymm.h @@ -25,7 +25,9 @@ #define ARM_COMPUTE_NESYMM_H #include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute @@ -49,13 +51,10 @@ using qsymm16x8x2_t = int16x8x2_t; /**< 16 bit quantized symmetric vector with 1 * @return Quantized values */ template <bool is_bounded_relu> -int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32, - int result_fixedpoint_multiplier, - int32_t result_shift, - int16x8_t min_s16, - int16x8_t max_s16) +int16x8_t finalize_quantization_int16( + int32x4x2_t &in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int16x8_t min_s16, int16x8_t max_s16) { - if(result_shift < 0) + if (result_shift < 0) { in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << -result_shift)); in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << -result_shift)); @@ -76,7 +75,7 @@ int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32, // Convert S32 to S16 int16x8_t out_s16 = 
vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_s16 = vmaxq_s16(out_s16, min_s16); out_s16 = vminq_s16(out_s16, max_s16); @@ -98,13 +97,14 @@ int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32, * @return Quantized values */ template <bool is_bounded_relu> -inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoint_multiplier, - int32_t result_shift, int16_t min_s16, int16_t max_s16) +inline int16_t finalize_quantization_int16( + int32_t in_value, int result_fixedpoint_multiplier, int32_t result_shift, int16_t min_s16, int16_t max_s16) { - if(result_shift < 0) + if (result_shift < 0) { - const int64_t in_64 = static_cast<int64_t>(in_value) * (1 << (-result_shift)) * static_cast<int64_t>(result_fixedpoint_multiplier); - in_value = static_cast<int32_t>((in_64 + (1 << 30)) >> 31); + const int64_t in_64 = static_cast<int64_t>(in_value) * (1 << (-result_shift)) * + static_cast<int64_t>(result_fixedpoint_multiplier); + in_value = static_cast<int32_t>((in_64 + (1 << 30)) >> 31); } else { @@ -117,7 +117,7 @@ inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoi // Bound the result int16_t out_s16 = static_cast<int16_t>(std::max<int32_t>(-32768, std::min<int32_t>(32767, in_value))); - if(is_bounded_relu) + if (is_bounded_relu) { out_s16 = static_cast<int16_t>(std::max(min_s16, std::min(max_s16, out_s16))); } @@ -134,14 +134,9 @@ inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoi */ inline float32x4x2_t vdequantize_int16(const int16x8_t &qv, float scale) { - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x2_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale) - } - }; + const float32x4_t vscale = vdupq_n_f32(scale); + const float32x4x2_t vdequantized_input = {{vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)}}; return vdequantized_input; } @@ -156,18 +151,13 @@ inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale) { const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x2_t rf = - { - { + const int32x4x2_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)) + vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)) #else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)) + vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)) #endif //__aarch64__ - } - }; + }}; return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); } @@ -180,17 +170,14 @@ inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale) */ inline float32x4x4_t vdequantize(const int16x8x2_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const float32x4_t vscale = vdupq_n_f32(scale); - const float32x4x4_t vdequantized_input = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale), - } - }; + const float scale = qi.scale; + const float32x4_t vscale = 
vdupq_n_f32(scale); + const float32x4x4_t vdequantized_input = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale), + }}; return vdequantized_input; } @@ -206,24 +193,20 @@ inline qsymm16x8x2_t vquantize_qsymm16(const float32x4x4_t &qv, const UniformQua const float scale = qi.scale; ARM_COMPUTE_ERROR_ON(scale == 0.f); const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { + const int32x4x4_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), - vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), + vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), + vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), + vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), #else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), - vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), + vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), + vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), + vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), + vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), #endif //__aarch64__ - } - }; - const qsymm16x8x2_t res = - { + }}; + const qsymm16x8x2_t res = { vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])), vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])), }; diff --git a/src/core/NEON/SVEAsymm.h b/src/core/NEON/SVEAsymm.h index eea2627c62..a448cde475 100644 --- a/src/core/NEON/SVEAsymm.h +++ b/src/core/NEON/SVEAsymm.h @@ -26,6 +26,7 @@ #if defined(ARM_COMPUTE_ENABLE_SVE2) #include "src/core/NEON/SVEMath.h" + #include <arm_sve.h> namespace arm_compute @@ -70,10 +71,18 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svuint8_t &qv, float scal const auto voffset = svdup_n_s32(offset); const auto vscale = svdup_n_f32(scale); const svfloat32x4_t vdequantized_input = svcreate4_f32( - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)), vscale)); + svmul_f32_z(pg, + svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)), + vscale), + svmul_f32_z(pg, + svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)), + vscale), + svmul_f32_z(pg, + svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)), + vscale), + svmul_f32_z(pg, + svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)), + vscale)); return vdequantized_input; } @@ -104,10 +113,10 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale const auto voffset = svdup_n_s32(offset); const auto vscale = svdup_n_f32(scale); const svfloat32x4_t 
vdequantized_input = svcreate4_f32( - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale)); + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale)); return vdequantized_input; } @@ -135,11 +144,11 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const Unifo */ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const svfloat32x4_t vscale) { - const svfloat32x4_t vdequantized_input = svcreate4_f32( - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3))); + const svfloat32x4_t vdequantized_input = + svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3))); return vdequantized_input; } @@ -153,12 +162,12 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const svflo */ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale) { - const auto vscale = svdup_n_f32(scale); - const svfloat32x4_t vdequantized_input = svcreate4_f32( - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale)); + const auto vscale = svdup_n_f32(scale); + const svfloat32x4_t vdequantized_input = + svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale)); return vdequantized_input; } diff --git a/src/core/NEON/SVEMath.h b/src/core/NEON/SVEMath.h index 5ada7ae0ff..6d69b330ba 100644 --- a/src/core/NEON/SVEMath.h +++ b/src/core/NEON/SVEMath.h @@ -28,6 +28,7 @@ #include "src/core/NEON/wrapper/intrinsics/svcvt.h" #include "src/core/NEON/wrapper/intrinsics/svdup_n.h" #include "src/core/NEON/wrapper/intrinsics/svreinterpret.h" + #include <arm_sve.h> #include <array> @@ -181,9 +182,12 @@ svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b); * @return The converted integer vector */ 
template <typename int_vec_type> -int_vec_type convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3); +int_vec_type convert_float_to_int(const svfloat32_t &in_0, + const svfloat32_t &in_1, + const svfloat32_t &in_2, + const svfloat32_t &in_3); } // namespace arm_compute #include "src/core/NEON/SVEMath.inl" #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#endif /* ARM_COMPUTE_SVEMATH_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_SVEMATH_H */ diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl index 8973d0b273..b30125dcb7 100644 --- a/src/core/NEON/SVEMath.inl +++ b/src/core/NEON/SVEMath.inl @@ -32,8 +32,16 @@ namespace arm_compute { -inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, svfloat32_t coeff_1, svfloat32_t coeff_2, svfloat32_t coeff_3, - svfloat32_t coeff_4, svfloat32_t coeff_5, svfloat32_t coeff_6, svfloat32_t coeff_7, svfloat32_t coeff_8) +inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, + svfloat32_t x, + svfloat32_t coeff_1, + svfloat32_t coeff_2, + svfloat32_t coeff_3, + svfloat32_t coeff_4, + svfloat32_t coeff_5, + svfloat32_t coeff_6, + svfloat32_t coeff_7, + svfloat32_t coeff_8) { const auto A = svmla_f32_z(pg, coeff_1, coeff_5, x); const auto B = svmla_f32_z(pg, coeff_3, coeff_7, x); @@ -45,8 +53,16 @@ inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, svfloat32_t c return res; } -inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg, svfloat16_t x, svfloat16_t coeff_1, svfloat16_t coeff_2, svfloat16_t coeff_3, - svfloat16_t coeff_4, svfloat16_t coeff_5, svfloat16_t coeff_6, svfloat16_t coeff_7, svfloat16_t coeff_8) +inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg, + svfloat16_t x, + svfloat16_t coeff_1, + svfloat16_t coeff_2, + svfloat16_t coeff_3, + svfloat16_t coeff_4, + svfloat16_t coeff_5, + svfloat16_t coeff_6, + svfloat16_t coeff_7, + svfloat16_t coeff_8) { const auto A = svmla_f16_z(pg, coeff_1, coeff_5, x); const auto B = svmla_f16_z(pg, coeff_3, coeff_7, x); @@ -90,15 +106,17 @@ inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x) const auto c4 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[3])); const auto c5 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[4])); - const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f - const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f - const auto neg_ln2_hi = svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f - const auto neg_ln2_lo = svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f + const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f + const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f + const auto neg_ln2_hi = + svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f + const auto neg_ln2_lo = + svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f const auto inf = svdup_n_f32(std::numeric_limits<float>::infinity()); - const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5) + const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5) const auto zero = svdup_n_f32(0.f); - const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125) + const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125) // Range reduction: // e^x = 2^n * e^r @@ -114,23 +132,23 @@ inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x) // (i.e. n) because the decimal part has been pushed out and lost. // * The addition of 127 makes the FP32 fraction part of z ready to be used as the exponent // in FP32 format. Left shifting z by 23 bits will result in 2^n. 
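    // Worked example (added for illustration; not in the original source):
    // for n = 3 the magic-number addition yields z = 2^23 + 127 + 3, whose
    // IEEE-754 word has exponent field 150 and the value 130 = 127 + 3 in its
    // low mantissa bits; shifting that word left by 23 moves 130 into the
    // exponent field, so the reinterpreted float is 2^(130-127) = 2^3 = 8.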
- const auto z = svmla_f32_z(pg, shift, x, inv_ln2); - const auto n = svsub_f32_z(pg, z, shift); - const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n + const auto z = svmla_f32_z(pg, shift, x, inv_ln2); + const auto n = svsub_f32_z(pg, z, shift); + const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n // The calculation of n * ln(2) is done using 2 steps to achieve accuracy beyond FP32. // This outperforms longer Taylor series (3-4 tabs) both in term of accuracy and performance. const auto r_hi = svmla_f32_z(pg, x, n, neg_ln2_hi); - const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo); + const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo); // Compute the truncated Taylor series of e^r. // poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5) const auto r2 = svmul_f32_z(pg, r, r); - const auto p1 = svmul_f32_z(pg, c1, r); - const auto p23 = svmla_f32_z(pg, c2, c3, r); - const auto p45 = svmla_f32_z(pg, c4, c5, r); - const auto p2345 = svmla_f32_z(pg, p23, p45, r2); + const auto p1 = svmul_f32_z(pg, c1, r); + const auto p23 = svmla_f32_z(pg, c2, c3, r); + const auto p45 = svmla_f32_z(pg, c4, c5, r); + const auto p2345 = svmla_f32_z(pg, p23, p45, r2); const auto p12345 = svmla_f32_z(pg, p1, p2345, r2); auto poly = svmla_f32_z(pg, scale, p12345, scale); @@ -213,7 +231,8 @@ inline svfloat32_t svlog_f32_z(svbool_t pg, svfloat32_t x) auto val = svreinterpret_f32_s32(svsub_s32_z(pg, svreinterpret_s32_f32(x), svlsl_n_s32_z(pg, m, 23))); // Polynomial Approximation - auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6, log_tab_7, log_tab_8); + auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6, + log_tab_7, log_tab_8); // Reconstruct poly = svmla_f32_z(pg, poly, svcvt_f32_s32_z(pg, m), CONST_LN2); @@ -259,7 +278,8 @@ inline svfloat32_t svsin_f32_z(svbool_t pg, svfloat32_t val) //Find positive or negative const auto c_v = svabs_z(pg, wrapper::svcvt_z<int32_t>(pg, svmul_z(pg, val, ipi_v))); const auto sign_v = svcmple(pg, val, wrapper::svdup_n(ScalarType(0))); - const auto odd_v = svcmpne(pg, svand_z(pg, wrapper::svreinterpret<IntType>(c_v), wrapper::svdup_n(IntType(1))), wrapper::svdup_n(IntType(0))); + const auto odd_v = svcmpne(pg, svand_z(pg, wrapper::svreinterpret<IntType>(c_v), wrapper::svdup_n(IntType(1))), + wrapper::svdup_n(IntType(0))); auto neg_v = sveor_z(pg, odd_v, sign_v); @@ -347,7 +367,10 @@ inline svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b) #if defined(ARM_COMPUTE_ENABLE_SVE2) template <> -inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3) +inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, + const svfloat32_t &in_1, + const svfloat32_t &in_2, + const svfloat32_t &in_3) { svuint8_t out; const auto all_true_pg = svptrue_b32(); @@ -381,7 +404,10 @@ inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, const } template <> -inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3) +inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0, + const svfloat32_t &in_1, + const svfloat32_t &in_2, + const svfloat32_t &in_3) { svint8_t out; const auto all_true_pg = svptrue_b32(); diff --git a/src/core/NEON/SVESymm.h 
b/src/core/NEON/SVESymm.h index 6808577681..288d45d979 100644 --- a/src/core/NEON/SVESymm.h +++ b/src/core/NEON/SVESymm.h @@ -28,6 +28,7 @@ #if defined(ARM_COMPUTE_ENABLE_SVE2) #include "src/core/NEON/SVEMath.h" + #include <arm_sve.h> namespace arm_compute @@ -42,8 +43,10 @@ namespace arm_compute */ inline svfloat32x2_t svdequantize_qsymm16_z(svbool_t pg, const svint16_t &qv, float scale) { - const auto vscale = svdup_n_f32(scale); - const svfloat32x2_t vdequantized_input = svcreate2_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale), svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), vscale)); + const auto vscale = svdup_n_f32(scale); + const svfloat32x2_t vdequantized_input = + svcreate2_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), vscale)); return vdequantized_input; } @@ -76,13 +79,13 @@ inline svint16_t svquantize_qsymm16_z(svbool_t pg, const svfloat32x2_t qv, float */ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint16x2_t qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const auto vscale = svdup_n_f32(scale); - const svfloat32x4_t vdequantized_input = svcreate4_f32( - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale), - svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale)); + const float scale = qi.scale; + const auto vscale = svdup_n_f32(scale); + const svfloat32x4_t vdequantized_input = + svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale), + svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale)); return vdequantized_input; } @@ -112,4 +115,4 @@ inline svint16x2_t svquantize_qsymm16_z(svbool_t pg, const svfloat32x4_t qv, con } // namespace arm_compute #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ -#endif // ARM_COMPUTE_NESYMM_H
\ No newline at end of file +#endif // ARM_COMPUTE_NESYMM_H diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index 108b199df7..deb89996a9 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -28,18 +28,17 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - +#include "src/core/NEON/kernels/batchnormalization/impl/list.h" #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/NEON/kernels/batchnormalization/impl/list.h" -#include "src/core/common/Registrars.h" - #include <map> namespace arm_compute @@ -52,8 +51,15 @@ struct BatchNormalizationSelectorData const CPUInfo &ci; }; using BatchNormalizationSelectorPtr = std::add_pointer<bool(const BatchNormalizationSelectorData &data)>::type; -using BatchNormalizationKernelPtr = std::add_pointer<void(ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const ITensor *, - float, ActivationLayerInfo &, const Window &)>::type; +using BatchNormalizationKernelPtr = std::add_pointer<void(ITensor *, + ITensor *, + const ITensor *, + const ITensor *, + const ITensor *, + const ITensor *, + float, + ActivationLayerInfo &, + const Window &)>::type; struct BatchNormalizationKernel { @@ -62,41 +68,32 @@ struct BatchNormalizationKernel BatchNormalizationKernelPtr ukernel; }; -static const BatchNormalizationKernel available_kernels[] = -{ +static const BatchNormalizationKernel available_kernels[] = { #if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp16_batch_normalization", - [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization) - }, - { - "sve_fp32_batch_normalization", - [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization) - }, + {"sve_fp16_batch_normalization", + [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, + REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization)}, + {"sve_fp32_batch_normalization", + [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, + REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)}, #endif /* !defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_batch_normalization", - [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization) - }, + {"neon_fp16_batch_normalization", + [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)}, #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - { - "neon_fp32_batch_normalization", - [](const 
BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization) - }, + {"neon_fp32_batch_normalization", + [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)}, #endif /* !defined(ARM_COMPUTE_ENABLE_NEON) */ }; const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -104,25 +101,31 @@ const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelec return nullptr; } -Status -validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_UNUSED(epsilon); - const auto *uk = get_implementation(BatchNormalizationSelectorData{ input->data_type(), CPUInfo::get() }); + const auto *uk = get_implementation(BatchNormalizationSelectorData{input->data_type(), CPUInfo::get()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - if(act_info.enabled()) + if (act_info.enabled()) { ActivationLayerInfo::ActivationFunction act = act_info.activation(); - ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU - && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU - && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU && + act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + act != + ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a()); } - if(nullptr != output) + if (nullptr != output) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -131,17 +134,18 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var); - if(beta != nullptr) + if (beta != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta); } - if(gamma != nullptr) + if (gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma); } - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); return Status{}; } @@ -169,10 +173,12 @@ void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &win // Only compute denominator and constants 
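The available_kernels tables reformatted above all follow the same first-match selection scheme; a minimal standalone version of the pattern, with illustrative names rather than the library's:

#include <cstddef>

struct SelectorData
{
    bool has_sve;
};

using IsSelectedPtr = bool (*)(const SelectorData &);
using KernelFnPtr   = void (*)(const float *, float *, size_t);

struct KernelEntry
{
    const char   *name;
    IsSelectedPtr is_selected;
    KernelFnPtr   ukernel;
};

void dummy_kernel(const float *, float *, size_t)
{
}

// More specific entries (e.g. SVE) come first; the generic fallback's
// predicate always returns true, so iteration order decides priority.
static const KernelEntry available[] = {
    {"sve_kernel", [](const SelectorData &d) { return d.has_sve; }, dummy_kernel},
    {"neon_kernel", [](const SelectorData &) { return true; }, dummy_kernel},
};

const KernelEntry *get_impl(const SelectorData &data)
{
    for (const auto &entry : available)
    {
        if (entry.is_selected(data))
        {
            return &entry;
        }
    }
    return nullptr; // callers treat this as "no valid kernel"
}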
once per feature map. int slice = -1; - const auto input_mean = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; T mean = static_cast<T>(0); T var = static_cast<T>(0); @@ -186,80 +192,83 @@ void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &win auto beta_vec = wrapper::vdup_n(beta, ExactTagType{}); auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{}); const auto epsilon_vec = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{}); - execute_window_loop(win_to_use, [&](const Coordinates & id) - { - const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - const auto output_ptr = reinterpret_cast<T *>(output.ptr()); - - if(slice != id.z()) + execute_window_loop( + win_to_use, + [&](const Coordinates &id) { - mean = input_mean[id.z()]; - var = input_var[id.z()]; - mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - var_vec = wrapper::vdup_n(var, ExactTagType{}); - if(input_gamma != nullptr) - { - gamma = input_gamma[id.z()]; - gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); - } - if(input_beta != nullptr) + const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + const auto output_ptr = reinterpret_cast<T *>(output.ptr()); + + if (slice != id.z()) { - beta = input_beta[id.z()]; - beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + mean = input_mean[id.z()]; + var = input_var[id.z()]; + mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + var_vec = wrapper::vdup_n(var, ExactTagType{}); + if (input_gamma != nullptr) + { + gamma = input_gamma[id.z()]; + gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + } + if (input_beta != nullptr) + { + beta = input_beta[id.z()]; + beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + } + + // Calculate denominator + denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + denominator = wrapper::vgetlane(denominator_vec, 0); + slice = id.z(); } - // Calculate denominator - denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - denominator = wrapper::vgetlane(denominator_vec, 0); - slice = id.z(); - } - - // Perform core calculations using vector operations - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Calculate x bar - const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); - const auto x_bar = wrapper::vmul(numerator, denominator_vec); - auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); - - // Perform fused activation - if(fused_activation) + // Perform core calculations using vector operations + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - activation_functor(res); + // Calculate x bar + const auto numerator = 
wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); + const auto x_bar = wrapper::vmul(numerator, denominator_vec); + auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (fused_activation) + { + activation_functor(res); + } + + // Store results + wrapper::vstore(output_ptr + x, res); } - // Store results - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const T numerator = input_ptr[x] - mean; - const T x_bar = numerator * denominator; - T res = beta + x_bar * gamma; - - // Perform fused activation - if(fused_activation) + // Compute left-over elements + for (; x < window_end_x; ++x) { - activation_functor(res); + const T numerator = input_ptr[x] - mean; + const T x_bar = numerator * denominator; + T res = beta + x_bar * gamma; + + // Perform fused activation + if (fused_activation) + { + activation_functor(res); + } + + // Store results + *(output_ptr + x) = res; } - - // Store results - *(output_ptr + x) = res; - } - }, - input, output); + }, + input, output); } void NEBatchNormalizationLayerKernel::configure_non_fused() { - switch(_input->info()->data_type()) + switch (_input->info()->data_type()) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>; + _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false, + detail::dummy<float16_t, 8>>; break; #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: @@ -274,23 +283,25 @@ void NEBatchNormalizationLayerKernel::configure_non_fused() void NEBatchNormalizationLayerKernel::configure_fused() { // NCHW Fused Batched Normalization with activation functions : FP32 - static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw = - { - { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>> }, - { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>> } - }; + static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw = { + {ActivationLayerInfo::ActivationFunction::RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>>}}; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // NCHW Fused Batched Normalization with activation functions : FP16 - static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw = - { - { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>> }, - { 
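Stripped of the vector wrappers, the per-element work in batch_normalization_nchw is as below; the bool template parameter models how the kernel removes the fused-activation branch at compile time (a sketch, with ReLU standing in for the three supported activations):

#include <algorithm>
#include <cmath>

// res = gamma * (x - mean) / sqrt(var + epsilon) + beta; the denominator is
// hoisted out because mean/var are constant within one feature map.
template <typename T, bool fused_relu>
void batch_norm_row(const T *in, T *out, int n, T mean, T var, T gamma, T beta, T epsilon)
{
    const T denominator = T(1) / std::sqrt(var + epsilon);
    for (int x = 0; x < n; ++x)
    {
        T res = gamma * ((in[x] - mean) * denominator) + beta;
        if (fused_relu) // constant per instantiation, so the branch folds away
        {
            res = std::max(res, T(0));
        }
        out[x] = res;
    }
}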
ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>> } - }; + static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw = { + {ActivationLayerInfo::ActivationFunction::RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>>}}; #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - switch(_input->info()->data_type()) + switch (_input->info()->data_type()) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: @@ -307,22 +318,32 @@ void NEBatchNormalizationLayerKernel::configure_fused() } NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(), _act_info() + : _func(nullptr), + _input(nullptr), + _output(nullptr), + _mean(nullptr), + _var(nullptr), + _gamma(nullptr), + _beta(nullptr), + _epsilon(), + _act_info() { } -void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, - const ITensor *mean, const ITensor *var, - const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo act_info) +void NEBatchNormalizationLayerKernel::configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, - mean->info(), var->info(), - (beta != nullptr) ? beta->info() : nullptr, - (gamma != nullptr) ? gamma->info() : nullptr, - epsilon, act_info)); + mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr, + (gamma != nullptr) ? 
gamma->info() : nullptr, epsilon, act_info)); _input = input; _output = input; @@ -334,16 +355,16 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, _act_info = act_info; const bool run_in_place = (output == nullptr) || (output == input); - if(!run_in_place) + if (!run_in_place) { _output = output; } // Configure activation function to run const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW; - if(is_nchw) + if (is_nchw) { - if(_act_info.enabled()) + if (_act_info.enabled()) { configure_fused(); } @@ -357,17 +378,21 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, Window win = calculate_max_window(*input->info(), Steps()); INEKernel::configure(win); - if(output != nullptr) + if (output != nullptr) { // Output auto initialization if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); } } -Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info)); @@ -382,13 +407,14 @@ void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo ARM_COMPUTE_ERROR_ON(_func == nullptr && _input->info()->data_layout() == DataLayout::NCHW); const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW; - if(is_nchw) + if (is_nchw) { (this->*_func)(window); } else { - const auto *uk = get_implementation(BatchNormalizationSelectorData{ _input->info()->data_type(), CPUInfo::get() }); + const auto *uk = + get_implementation(BatchNormalizationSelectorData{_input->info()->data_type(), CPUInfo::get()}); uk->ukernel(_input, _output, _mean, _var, _beta, _gamma, _epsilon, _act_info, window); } } diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h index 0551ace30c..2e8ff0dc9a 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -68,7 +69,13 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. 
*/ - void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f, + void configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta = nullptr, + const ITensor *gamma = nullptr, + float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayerKernel * @@ -85,10 +92,14 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr, - float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta = nullptr, + const ITensorInfo *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp index 83fb5f6f51..f299bb94a4 100644 --- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp @@ -27,8 +27,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -46,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -54,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf return Status{}; } -Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, int block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status validate_arguments_static(const ITensorInfo *input, + int block_shape_x, + int block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); @@ -65,13 +70,14 @@ Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, in const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - const TensorShape expected_output_shape = compute_batch_to_space_shape(input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info); - const TensorInfo 
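Given the NEBatchNormalizationLayerKernel declarations above (output, beta and gamma are optional), hypothetical call sites look like the following; the tensors are assumed to be initialised elsewhere:

NEBatchNormalizationLayerKernel kernel;

// In-place (output == nullptr), no beta/gamma, epsilon left at 0.001f:
kernel.configure(&input, nullptr, &mean, &var);

// Out-of-place with a fused bounded ReLU; validate() takes the same info:
const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f);
ARM_COMPUTE_ERROR_THROW_ON(NEBatchNormalizationLayerKernel::validate(
    input.info(), output.info(), mean.info(), var.info(), beta.info(), gamma.info(), 0.001f, act));
kernel.configure(&input, &output, &mean, &var, &beta, &gamma, 0.001f, act);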
expected_output = output->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = compute_batch_to_space_shape( + input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info); + const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output); } @@ -80,7 +86,13 @@ Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, in } // namespace NEBatchToSpaceLayerKernel::NEBatchToSpaceLayerKernel() - : _input(nullptr), _block_shape(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _block_shape_x(), _block_shape_y(), _crop_info() + : _input(nullptr), + _block_shape(nullptr), + _output(nullptr), + _data_layout(DataLayout::UNKNOWN), + _block_shape_x(), + _block_shape_y(), + _crop_info() { } @@ -99,15 +111,18 @@ void NEBatchToSpaceLayerKernel::configure(const ITensor *input, const ITensor *b ICPPKernel::configure(win); } -void NEBatchToSpaceLayerKernel::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) +void NEBatchToSpaceLayerKernel::configure( + const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - const TensorShape output_shape = compute_batch_to_space_shape(input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); + const TensorShape output_shape = compute_batch_to_space_shape( + input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); // Output auto initialization if not yet initialized auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info)); _input = input; _output = output; @@ -121,14 +136,19 @@ void NEBatchToSpaceLayerKernel::configure(const ITensor *input, int32_t block_sh ICPPKernel::configure(win); } -Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output)); return Status{}; } -Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info)); @@ -141,7 +161,7 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - if(_block_shape != nullptr) + if (_block_shape != nullptr) { // Retrieve the block shapes dynamically 
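validate_arguments_static compares the output against compute_batch_to_space_shape; the relation it encodes, written out under the assumption that cropping simply trims the restored spatial borders:

struct Shape4D
{
    int w, h, c, n;
};

// Batch-to-space folds block_x * block_y sub-batches back into the spatial
// dimensions, so the batch count must divide evenly (checked above as
// n % (block_x * block_y) == 0); cropping then trims the borders.
Shape4D batch_to_space_shape(
    const Shape4D &in, int block_x, int block_y, int crop_left, int crop_right, int crop_top, int crop_bottom)
{
    Shape4D out = in;
    out.w       = in.w * block_x - crop_left - crop_right;
    out.h       = in.h * block_y - crop_top - crop_bottom;
    out.n       = in.n / (block_x * block_y);
    return out;
}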
_block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0))); @@ -155,31 +175,32 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info int batch_id = 0; // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - - const int x = id.x(); - const int y = id.y(); - const int z = id.z(); - // Translate x, y to uncropped version - const int x_c = x + _crop_info.left; - const int y_c = y + _crop_info.top; - - const int in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; - const int in_x = x_c / _block_shape_x; - const int in_y = y_c / _block_shape_y; - Coordinates input_coords{ in_x, in_y, z, in_batch }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); + execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const int x = id.x(); + const int y = id.y(); + const int z = id.z(); + // Translate x, y to uncropped version + const int x_c = x + _crop_info.left; + const int y_c = y + _crop_info.top; + + const int in_batch = + batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; + const int in_x = x_c / _block_shape_x; + const int in_y = y_c / _block_shape_y; + Coordinates input_coords{in_x, in_y, z, in_batch}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } else { @@ -188,26 +209,28 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - - const int x = id.y(); - const int y = id.z(); - - // Translate x, y to uncropped version - const int x_c = x + _crop_info.left; - const int y_c = y + _crop_info.top; - - const int in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; - const int in_x = x_c / _block_shape_x; - const int in_y = y_c / _block_shape_y; - Coordinates input_coords{ 0, in_x, in_y, in_batch }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size * _input->info()->dimension(0)); - }, - out); + execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const int x = id.y(); + const int y = id.z(); + + // Translate x, y to uncropped version + const int x_c = x + _crop_info.left; + const int y_c = y + _crop_info.top; + + const int in_batch = + batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; + const int in_x = x_c / _block_shape_x; + const int in_y = y_c / _block_shape_y; + Coordinates input_coords{0, in_x, in_y, in_batch}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), + element_size * _input->info()->dimension(0)); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h index 5eceee0904..d98ac621b0 100644 --- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h +++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H #include "arm_compute/core/Types.h" + 
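The index arithmetic in the NCHW loop above inverts that packing element by element; the same mapping as a plain function, with names taken from the kernel:

struct SrcCoords
{
    int x, y, batch;
};

// For output element (x, y) of output batch batch_id: the position inside
// the block selects the source sub-batch, and dividing the block out gives
// the source spatial coordinate. batch_size is the number of output batches.
SrcCoords batch_to_space_src(
    int x, int y, int batch_id, int batch_size, int block_x, int block_y, int crop_left, int crop_top)
{
    const int x_c = x + crop_left; // translate to the uncropped grid
    const int y_c = y + crop_top;

    SrcCoords in;
    in.batch = batch_id + ((x_c % block_x) + (y_c % block_y) * block_x) * batch_size;
    in.x     = x_c / block_x;
    in.y     = y_c / block_y;
    return in;
}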
#include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -68,7 +69,11 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed */ - void configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info = CropInfo{}); + void configure(const ITensor *input, + int32_t block_shape_x, + int32_t block_shape_y, + ITensor *output, + const CropInfo &crop_info = CropInfo{}); /** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -90,7 +95,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info = CropInfo{}); + static Status validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info = CropInfo{}); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp index 677c5cddcc..a59bbd233b 100644 --- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include <arm_neon.h> #include <cstdint> @@ -55,8 +56,7 @@ inline void bitwise_and(const T *__restrict input1, const T *__restrict input2, } } // namespace -NEBitwiseAndKernel::NEBitwiseAndKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +NEBitwiseAndKernel::NEBitwiseAndKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -86,8 +86,7 @@ void NEBitwiseAndKernel::configure(const ITensor *input1, const ITensor *input2, Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, - AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), + update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access); @@ -103,9 +102,7 @@ void NEBitwiseAndKernel::run(const Window &window, const ThreadInfo &info) Iterator input2(_input2, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr()); - }, - input1, input2, output); + execute_window_loop( + window, [&](const Coordinates &) { bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr()); }, input1, + input2, output); } diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp index 19b1af690a..ecd181a7af 100644 --- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include 
"arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,8 +51,7 @@ inline void bitwise_not_U8_U8(const uint8_t *__restrict input, uint8_t *__restri } } // namespace -NEBitwiseNotKernel::NEBitwiseNotKernel() - : _input(nullptr), _output(nullptr) +NEBitwiseNotKernel::NEBitwiseNotKernel() : _input(nullptr), _output(nullptr) { } @@ -77,7 +77,8 @@ void NEBitwiseNotKernel::configure(const ITensor *input, ITensor *output) // Configure kernel window Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access); + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + output_access); INEKernel::configure(win); } @@ -90,9 +91,6 @@ void NEBitwiseNotKernel::run(const Window &window, const ThreadInfo &info) Iterator input(_input, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_not_U8_U8(input.ptr(), output.ptr()); - }, - input, output); + execute_window_loop( + window, [&](const Coordinates &) { bitwise_not_U8_U8(input.ptr(), output.ptr()); }, input, output); } diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp index 08094fbfcf..4c906134aa 100644 --- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,7 +43,8 @@ class Coordinates; namespace { -inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) +inline void +bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) { const uint8x16_t val1 = vld1q_u8(input1); const uint8x16_t val2 = vld1q_u8(input2); @@ -51,8 +53,7 @@ inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t } } // namespace -NEBitwiseOrKernel::NEBitwiseOrKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +NEBitwiseOrKernel::NEBitwiseOrKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -82,8 +83,7 @@ void NEBitwiseOrKernel::configure(const ITensor *input1, const ITensor *input2, Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, - AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), + update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access); @@ -99,9 +99,7 @@ void NEBitwiseOrKernel::run(const Window &window, const ThreadInfo &info) Iterator input2(_input2, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); - }, - input1, input2, output); + execute_window_loop( + 
window, [&](const Coordinates &) { bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1, + input2, output); } diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp index fc5b38b64f..dbbed2483c 100644 --- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,7 +43,8 @@ class Coordinates; namespace { -inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) +inline void +bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) { const uint8x16_t val1 = vld1q_u8(input1); const uint8x16_t val2 = vld1q_u8(input2); @@ -51,8 +53,7 @@ inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t } } // namespace -NEBitwiseXorKernel::NEBitwiseXorKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +NEBitwiseXorKernel::NEBitwiseXorKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -82,7 +83,8 @@ void NEBitwiseXorKernel::configure(const ITensor *input1, const ITensor *input2, AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), - AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access); + AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), + output_access); INEKernel::configure(win); } @@ -96,9 +98,7 @@ void NEBitwiseXorKernel::run(const Window &window, const ThreadInfo &info) Iterator input2(_input2, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); - }, - input1, input2, output); + execute_window_loop( + window, [&](const Coordinates &) { bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1, + input2, output); } diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp index 69bfd56ce0..cb869838e2 100644 --- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp +++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp @@ -27,8 +27,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/boundingboxtransform/list.h" @@ -45,7 +46,11 @@ struct BoundingBoxTransformSelectorData }; using BoundingBoxTransformSelctorPtr = std::add_pointer<bool(const BoundingBoxTransformSelectorData &data)>::type; -using BoundingBoxTransformUKernelPtr = std::add_pointer<void(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)>::type; +using BoundingBoxTransformUKernelPtr = std::add_pointer<void(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window)>::type; 
struct BoundingBoxTransformKernel { @@ -54,26 +59,19 @@ struct BoundingBoxTransformKernel BoundingBoxTransformUKernelPtr ukernel; }; -static const BoundingBoxTransformKernel available_kernels[] = -{ - { - "fp32_neon_boundingboxtransform", - [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform) - }, +static const BoundingBoxTransformKernel available_kernels[] = { + {"fp32_neon_boundingboxtransform", + [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_boundingboxtransform", - [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform) - }, + {"fp16_neon_boundingboxtransform", + [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #if defined(ARM_COMPUTE_ENABLE_NEON) - { - "qu16_neon_boundingboxtransform", - [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::QASYMM16; }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform) - }, + {"qu16_neon_boundingboxtransform", + [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::QASYMM16; }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)}, #endif //defined(ARM_COMPUTE_ENABLE_NEON) }; @@ -85,9 +83,9 @@ static const BoundingBoxTransformKernel available_kernels[] = */ const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -95,7 +93,10 @@ const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformS return nullptr; } -Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status validate_arguments(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(boxes); @@ -108,7 +109,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(info.scale() <= 0); - if(boxes->data_type() == DataType::QASYMM16) + if (boxes->data_type() == DataType::QASYMM16) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(deltas, 1, DataType::QASYMM8); const UniformQuantizationInfo deltas_qinfo = deltas->quantization_info().uniform(); @@ -120,12 +121,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas); } - if(pred_boxes->total_size() > 0) + if (pred_boxes->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, deltas); ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2); - if(pred_boxes->data_type() == 
DataType::QASYMM16) + if (pred_boxes->data_type() == DataType::QASYMM16) { const UniformQuantizationInfo pred_qinfo = pred_boxes->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(pred_qinfo.scale != 0.125f); @@ -142,13 +143,19 @@ NEBoundingBoxTransformKernel::NEBoundingBoxTransformKernel() { } -void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info) +void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info)); // Configure kernel window - auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info())); + auto_init_if_empty(*pred_boxes->info(), deltas->info() + ->clone() + ->set_data_type(boxes->info()->data_type()) + .set_quantization_info(boxes->info()->quantization_info())); // Set instance variables _boxes = boxes; @@ -164,7 +171,10 @@ void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred INEKernel::configure(win); } -Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info)); return Status{}; @@ -176,7 +186,7 @@ void NEBoundingBoxTransformKernel::run(const Window &window, const ThreadInfo &i ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = get_implementation(BoundingBoxTransformSelectorData{ _boxes->info()->data_type() }); + const auto *uk = get_implementation(BoundingBoxTransformSelectorData{_boxes->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_boxes, _pred_boxes, _deltas, _bbinfo, window); diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h index def827836c..3915994feb 100644 --- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h +++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h @@ -63,7 +63,8 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. 
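The element-wise math of the bounding-box transform lives in the dispatched ukernels and is not part of this diff; for orientation only, the conventional delta decoding such kernels implement is sketched below, without the BoundingBoxTransformInfo weights, image scaling and clamping the library adds on top:

#include <cmath>

struct Box
{
    float x1, y1, x2, y2;
};

// Standard box regression: (dx, dy) shift the centre in units of the box
// size, (dw, dh) rescale it on a log scale.
Box apply_deltas(const Box &b, float dx, float dy, float dw, float dh)
{
    const float w     = b.x2 - b.x1 + 1.f;
    const float h     = b.y2 - b.y1 + 1.f;
    const float ctr_x = b.x1 + 0.5f * w;
    const float ctr_y = b.y1 + 0.5f * h;

    const float pred_ctr_x = dx * w + ctr_x;
    const float pred_ctr_y = dy * h + ctr_y;
    const float pred_w     = std::exp(dw) * w;
    const float pred_h     = std::exp(dh) * h;

    return {pred_ctr_x - 0.5f * pred_w, pred_ctr_y - 0.5f * pred_h,
            pred_ctr_x + 0.5f * pred_w, pred_ctr_y + 0.5f * pred_h};
}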
* */ - void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info); + void + configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform * @@ -77,7 +78,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info); + static Status validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp index 64da1f2262..3b53b7055f 100644 --- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp +++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -44,15 +45,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW, DataLayout::NHWC); - const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); + const unsigned int channels = + input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + num_groups == channels, + "Channel shuffling with same number of groups as number of channels would be inefficient"); ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels); // There cannot be more groups than channels - ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, + "The number of channels must be a multiple of the number of groups"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -72,20 +77,22 @@ void channel_shuffle_nhwc(const ITensor *input, ITensor *output, unsigned int nu Iterator in(input, window); - execute_window_loop(window, [&](const Coordinates & id) - { - // Shuffle channel - const unsigned int curr_channel = id.x(); - const unsigned int group_id = curr_channel * rK; - const unsigned int r = group_id * K; - const unsigned int channel_id = curr_channel - r; - - // Calculate output coordinates - Coordinates out_coords = id; - out_coords.set(Window::DimX, channel_id * num_groups + group_id); - std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords)); - }, - in); + 
execute_window_loop( + window, + [&](const Coordinates &id) + { + // Shuffle channel + const unsigned int curr_channel = id.x(); + const unsigned int group_id = curr_channel * rK; + const unsigned int r = group_id * K; + const unsigned int channel_id = curr_channel - r; + + // Calculate output coordinates + Coordinates out_coords = id; + out_coords.set(Window::DimX, channel_id * num_groups + group_id); + std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords)); + }, + in); } void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int num_groups, const Window &window) { @@ -107,34 +114,35 @@ void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int nu Iterator in(input, win); - execute_window_loop(win, [&](const Coordinates & id) - { - // Shuffle channel - const unsigned int curr_channel = id.z(); - const unsigned int group_id = curr_channel * rK; - const unsigned int r = group_id * K; - const unsigned int channel_id = curr_channel - r; - - // Calculate output coordinates - Coordinates out_coords = id; - out_coords.set(Window::DimZ, channel_id * num_groups + group_id); - const uint8_t *input_ptr = in.ptr(); - uint8_t *output_ptr = output->ptr_to_element(out_coords); - - // Copy plane - for(unsigned int y = 0; y < height; ++y) + execute_window_loop( + win, + [&](const Coordinates &id) { - std::copy_n(input_ptr, row_size, output_ptr); - input_ptr += input_stride_y; - output_ptr += output_stride_y; - } - }, - in); + // Shuffle channel + const unsigned int curr_channel = id.z(); + const unsigned int group_id = curr_channel * rK; + const unsigned int r = group_id * K; + const unsigned int channel_id = curr_channel - r; + + // Calculate output coordinates + Coordinates out_coords = id; + out_coords.set(Window::DimZ, channel_id * num_groups + group_id); + const uint8_t *input_ptr = in.ptr(); + uint8_t *output_ptr = output->ptr_to_element(out_coords); + + // Copy plane + for (unsigned int y = 0; y < height; ++y) + { + std::copy_n(input_ptr, row_size, output_ptr); + input_ptr += input_stride_y; + output_ptr += output_stride_y; + } + }, + in); } } // namespace -NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel() - : _input(nullptr), _output(nullptr), _num_groups() +NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr), _num_groups() { } @@ -158,7 +166,8 @@ void NEChannelShuffleLayerKernel::configure(const ITensor *input, ITensor *outpu INEKernel::configure(win); } -Status NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) +Status +NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups)); return Status{}; @@ -170,7 +179,7 @@ void NEChannelShuffleLayerKernel::run(const Window &window, const ThreadInfo &in ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - switch(_input->info()->data_layout()) + switch (_input->info()->data_layout()) { case DataLayout::NHWC: channel_shuffle_nhwc(_input, _output, _num_groups, window); diff --git a/src/core/NEON/kernels/NECol2ImKernel.h b/src/core/NEON/kernels/NECol2ImKernel.h index 1976302036..bc6652fd30 100644 --- a/src/core/NEON/kernels/NECol2ImKernel.h +++ b/src/core/NEON/kernels/NECol2ImKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_NECOL2IMKERNEL_H #define ARM_COMPUTE_NECOL2IMKERNEL_H -#include 
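In channel_shuffle_nhwc/nchw above, rK is the reciprocal of K = channels / num_groups, so the index arithmetic is an integer transpose of a num_groups x K channel grid; spelled out:

// Source channel c decomposes as c = group_id * K + channel_id; the shuffle
// writes it to destination channel channel_id * num_groups + group_id.
inline unsigned int shuffled_channel(unsigned int c, unsigned int num_groups, unsigned int channels)
{
    const unsigned int K          = channels / num_groups;
    const unsigned int group_id   = c / K;            // the kernel computes this as c * rK
    const unsigned int channel_id = c - group_id * K; // i.e. c % K
    return channel_id * num_groups + group_id;
}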
"src/core/NEON/INEKernel.h" - #include "arm_compute/core/Size2D.h" +#include "src/core/NEON/INEKernel.h" + namespace arm_compute { class ITensor; diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp index 94c455305c..60271fbc74 100644 --- a/src/core/NEON/kernels/NECropKernel.cpp +++ b/src/core/NEON/kernels/NECropKernel.cpp @@ -26,14 +26,15 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Window.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/utils/helpers/bit_ops.h" #include "src/cpu/kernels/crop/list.h" @@ -47,7 +48,8 @@ struct CropSelectorData }; using CropSelectorPtr = std::add_pointer<bool(const CropSelectorData &data)>::type; -using CropUKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool)>::type; +using CropUKernelPtr = std::add_pointer<void( + const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool)>::type; struct CropUKernel { @@ -56,48 +58,23 @@ struct CropUKernel CropUKernelPtr ukernel; }; -static const CropUKernel available_kernels[] = -{ - { - "fp16_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window) - }, - { - "f32_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window) - }, - { - "u8_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window) - }, - { - "u16_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::U16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window) - }, - { - "u32_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::U32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window) - }, - { - "s8_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::S8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window) - }, - { - "s16_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window) - }, - { - "s32_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window) - }, +static const CropUKernel available_kernels[] = { + {"fp16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)}, + {"f32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)}, + {"u8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)}, + {"u16_neon_crop", 
[](const CropSelectorData &data) { return data.dt == DataType::U16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)}, + {"u32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)}, + {"s8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)}, + {"s16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)}, + {"s32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)}, }; /** Micro-kernel selector @@ -108,9 +85,9 @@ static const CropUKernel available_kernels[] = */ const CropUKernel *get_implementation(const CropSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -119,26 +96,40 @@ const CropUKernel *get_implementation(const CropSelectorData &data) return nullptr; } -inline void out_of_bounds_crop_window(const ITensor *output, float *output_ptr, float extrapolation_value, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit) +inline void out_of_bounds_crop_window(const ITensor *output, + float *output_ptr, + float extrapolation_value, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit) { - auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag()); - int32_t x = 0; - int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0)); - float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0); - for(; x <= limit - window_step_x; x += window_step_x) + auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag()); + int32_t x = 0; + int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0)); + float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0); + for (; x <= limit - window_step_x; x += window_step_x) { wrapper::vstore(output_start_ptr + x, in); } - for(; x < limit; ++x) + for (; x < limit; ++x) { *(output_start_ptr + x) = extrapolation_value; } } -inline void execute_window(const ITensor *input, const ITensor *output, Coordinates input_offset, float extrapolation_value, - const std::array<uint32_t, 2> &rows_out_of_bounds, const std::array<uint32_t, 2> &cols_out_of_bounds, NECropKernel::InBoundsCropFunction *in_bounds_crop_function, - bool is_height_flipped, bool has_cols_in_bounds, bool has_cols_out_of_bounds_before, bool has_cols_out_of_bounds_after, bool input_has_single_channel, bool is_width_flipped) +inline void execute_window(const ITensor *input, + const ITensor *output, + Coordinates input_offset, + float extrapolation_value, + const std::array<uint32_t, 2> &rows_out_of_bounds, + const std::array<uint32_t, 2> &cols_out_of_bounds, + NECropKernel::InBoundsCropFunction *in_bounds_crop_function, + bool is_height_flipped, + bool has_cols_in_bounds, + bool has_cols_out_of_bounds_before, + bool has_cols_out_of_bounds_after, + bool input_has_single_channel, + bool is_width_flipped) { // Output is always float. 
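// Illustrative sketch (annotation, not part of this commit; raw NEON intrinsics stand in for the wrapper helpers, and "out" is a hypothetical name for the row's output pointer): out_of_bounds_crop_window() above splats the fill value into a 128-bit register, stores it in vector-width steps, then finishes with a scalar tail:
//     const float32x4_t v = vdupq_n_f32(extrapolation_value); // four float lanes per 128-bit register
//     for (; x <= limit - 4; x += 4) { vst1q_f32(out + x, v); } // vector body
//     for (; x < limit; ++x)         { out[x] = extrapolation_value; } // scalar tail
// This is why the step computed just below is 16 bytes divided by sizeof(float), i.e. 4.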
const int window_step_x = 16 / sizeof(float); @@ -159,45 +150,66 @@ inline void execute_window(const ITensor *input, const ITensor *output, Coordina // |------------------------------| // Fill all output rows that have no elements that are within the input bounds with the extrapolation value. // First for the rows before the in bounds rows. - out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[0] * output->info()->dimension(1)); + out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, + rows_out_of_bounds[0] * output->info()->dimension(1)); output_ptr += rows_out_of_bounds[0] * output->info()->dimension(1) * output->info()->dimension(0); // Iterate through each row that has any elements within the input bounds. - for(uint32_t row = rows_out_of_bounds[0]; static_cast<int32_t>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]); - ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2]) + for (uint32_t row = rows_out_of_bounds[0]; + static_cast<int32_t>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]); + ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2]) { // Fill all elements in the row that are out of bounds with the extrapolation value. // First for the elements before the in bounds elements. - if(has_cols_out_of_bounds_before) + if (has_cols_out_of_bounds_before) { out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, cols_out_of_bounds[0]); } // Copy all elements within the input bounds from the input tensor. - if(has_cols_in_bounds) + if (has_cols_in_bounds) { (*in_bounds_crop_function)(input, output, output_ptr, input_offset, window_step_x, cols_out_of_bounds[0], - output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel, is_width_flipped); + output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel, + is_width_flipped); } // Fill all elements after the in bounds elements with the extrapolation value. - if(has_cols_out_of_bounds_after) + if (has_cols_out_of_bounds_after) { - out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1)); + out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, + output->info()->dimension(1) - cols_out_of_bounds[1], + output->info()->dimension(1)); } output_ptr += output->info()->dimension(1) * output->info()->dimension(0); } // Fill all rows after the in bounds elements with the extrapolation value. 
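// Region sketch (annotation): each output plane written by execute_window() is assembled as
//     rows_out_of_bounds[0] filled rows | in-bounds rows | rows_out_of_bounds[1] filled rows
// and every in-bounds row as
//     cols_out_of_bounds[0] fill | span copied by the selected ukernel | cols_out_of_bounds[1] fill
// so the extrapolation value only ever touches regions that fall outside the input image.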
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[1] * output->info()->dimension(1)); + out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, + rows_out_of_bounds[1] * output->info()->dimension(1)); } } // namespace NECropKernel::NECropKernel() - : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds() + : _input(nullptr), + _crop_boxes(nullptr), + _box_ind(nullptr), + _output(nullptr), + _start(), + _end(), + _crop_box_ind(0), + _extrapolation_value(0), + _rows_out_of_bounds(), + _cols_out_of_bounds() { } -void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind, float extrapolation_value) +void NECropKernel::configure(const ITensor *input, + const ITensor *crop_boxes, + const ITensor *box_ind, + ITensor *output, + uint32_t crop_box_ind, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(), crop_box_ind, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(), + crop_box_ind, extrapolation_value)); _input = input; _crop_boxes = crop_boxes; @@ -207,21 +219,27 @@ void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, co _extrapolation_value = extrapolation_value; } -Status NECropKernel::validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind, float extrapolation_value) +Status NECropKernel::validate(const ITensorInfo *input, + const ITensorInfo *crop_boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + uint32_t crop_box_ind, + float extrapolation_value) { ARM_COMPUTE_UNUSED(extrapolation_value); - const auto *uk = get_implementation(CropSelectorData{ input->data_type() }); + const auto *uk = get_implementation(CropSelectorData{input->data_type()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, + DataType::F16, DataType::U32, DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[0] != 4); ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]); ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] <= crop_box_ind); ARM_COMPUTE_RETURN_ERROR_ON(box_ind->tensor_shape()[0] <= crop_box_ind); - if(output->total_size() > 0) + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -242,48 +260,53 @@ void NECropKernel::configure_output_shape() // The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers.
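// Worked example of the mapping below (illustrative numbers): a normalized x0 = 0.25f on an input of width 9 gives
//     std::floor(0.25f * (9 - 1) + 0.5f) = std::floor(2.5f) = 2
// i.e. round-to-nearest of the scaled coordinate; y0, x1 and y1 follow the same rule.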
_start = Coordinates(std::floor(x0 * (_input->info()->tensor_shape()[1] - 1) + 0.5f), std::floor(y0 * (_input->info()->tensor_shape()[2] - 1) + 0.5f)); - _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f), - std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f)); - const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1, abs(_end[1] - _start[1]) + 1); + _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f), + std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f)); + const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1, + abs(_end[1] - _start[1]) + 1); _output->info()->set_tensor_shape(out_shape); bool is_width_flipped = _end[0] < _start[0]; bool is_height_flipped = _end[1] < _start[1]; - if(is_height_flipped) + if (is_height_flipped) { - _rows_out_of_bounds[0] = _start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_start[1] - _input->info()->dimension(2) + 1), - static_cast<uint32_t>(_output->info()->dimension(2))) : - 0; + _rows_out_of_bounds[0] = _start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) + ? std::min(static_cast<uint32_t>(_start[1] - _input->info()->dimension(2) + 1), + static_cast<uint32_t>(_output->info()->dimension(2))) + : 0; _rows_out_of_bounds[1] = _end[1] < 0 ? std::min(static_cast<uint32_t>(-_end[1]), - static_cast<uint32_t>(_output->info()->dimension(2))) : - 0; + static_cast<uint32_t>(_output->info()->dimension(2))) + : 0; } else { _rows_out_of_bounds[0] = _start[1] < 0 ? std::min(static_cast<uint32_t>(-_start[1]), - static_cast<uint32_t>(_output->info()->dimension(2))) : - 0; - _rows_out_of_bounds[1] = _end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_end[1] - _input->info()->dimension(2) + 1), - static_cast<uint32_t>(_output->info()->dimension(2))) : - 0; + static_cast<uint32_t>(_output->info()->dimension(2))) + : 0; + _rows_out_of_bounds[1] = _end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) + ? std::min(static_cast<uint32_t>(_end[1] - _input->info()->dimension(2) + 1), + static_cast<uint32_t>(_output->info()->dimension(2))) + : 0; } - if(is_width_flipped) + if (is_width_flipped) { - _cols_out_of_bounds[0] = _start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_start[0] - _input->info()->dimension(1) + 1), - static_cast<uint32_t>(_output->info()->dimension(1))) : - 0; + _cols_out_of_bounds[0] = _start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) + ? std::min(static_cast<uint32_t>(_start[0] - _input->info()->dimension(1) + 1), + static_cast<uint32_t>(_output->info()->dimension(1))) + : 0; _cols_out_of_bounds[1] = _end[0] < 0 ? std::min(static_cast<uint32_t>(-_end[0]), - static_cast<uint32_t>(_output->info()->dimension(1))) : - 0; + static_cast<uint32_t>(_output->info()->dimension(1))) + : 0; } else { _cols_out_of_bounds[0] = _start[0] < 0 ? std::min(static_cast<uint32_t>(-_start[0]), - static_cast<uint32_t>(_output->info()->dimension(1))) : - 0; - _cols_out_of_bounds[1] = _end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_end[0] - _input->info()->dimension(1) + 1), - static_cast<uint32_t>(_output->info()->dimension(1))) : - 0; + static_cast<uint32_t>(_output->info()->dimension(1))) + : 0; + _cols_out_of_bounds[1] = _end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) + ? 
std::min(static_cast<uint32_t>(_end[0] - _input->info()->dimension(1) + 1), + static_cast<uint32_t>(_output->info()->dimension(1))) + : 0; } INEKernel::configure(calculate_max_window(*_output->info())); @@ -298,13 +321,18 @@ void NECropKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON(_input->info()->has_padding()); ARM_COMPUTE_ERROR_ON(_output->info()->has_padding()); - const auto *uk = get_implementation(CropSelectorData{ _input->info()->data_type() }); + const auto *uk = get_implementation(CropSelectorData{_input->info()->data_type()}); uint32_t batch_index = *(reinterpret_cast<int32_t *>(_box_ind->ptr_to_element(Coordinates(_crop_box_ind)))); - Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0], - _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index); - execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, uk->ukernel, _end[1] < _start[1], - _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1), _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0, + Coordinates input_offset( + 0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0], + _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index); + execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, + uk->ukernel, + _end[1]<_start[1], + _cols_out_of_bounds[0] + + _cols_out_of_bounds[1]<_output->info()->dimension(1), _cols_out_of_bounds[0]> 0, + _cols_out_of_bounds[1]> 0, _start[0] <= _end[0], _end[0] < _start[0]); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NECropKernel.h b/src/core/NEON/kernels/NECropKernel.h index 6c989c1d2c..da4a1b26e5 100644 --- a/src/core/NEON/kernels/NECropKernel.h +++ b/src/core/NEON/kernels/NECropKernel.h @@ -25,7 +25,7 @@ #define ARM_COMPUTE_NEON_CROP_KERNEL_H #include "arm_compute/core/Types.h" -#include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -67,7 +67,12 @@ public: * @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0. * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. */ - void configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0); + void configure(const ITensor *input, + const ITensor *crop_boxes, + const ITensor *box_ind, + ITensor *output, + uint32_t crop_box_ind = 0, + float extrapolation_value = 0); /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel * @@ -82,7 +87,12 @@ public: * @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0. * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. 
*/ - static Status validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0); + static Status validate(const ITensorInfo *input, + const ITensorInfo *crop_boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + uint32_t crop_box_ind = 0, + float extrapolation_value = 0); /** Configure output tensor's shape as this can only be determined at runtime. */ void configure_output_shape(); @@ -91,7 +101,8 @@ public: void run(const Window &window, const ThreadInfo &info) override; /** Function to use for in bounds crop for the particular tensor types passed to configure() */ - using InBoundsCropFunction = void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool); + using InBoundsCropFunction = + void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool); private: const ITensor *_input; diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp index 6dcc85ec2e..de0079ee60 100644 --- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include <arm_neon.h> #include <cstdint> @@ -52,12 +53,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width])); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != + (block_shape * input->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != + (block_shape * input->tensor_shape()[idx_height])); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -74,7 +77,8 @@ NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel() void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); + TensorShape output_shape = + compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); // Output auto initialization if not yet initialized auto_init_if_empty(*output->info(), 
input->info()->clone()->set_tensor_shape(output_shape)); @@ -117,26 +121,27 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { Window slice_in = window.first_slice_window_2D(); do { Iterator in(_input, slice_in); - execute_window_loop(slice_in, [&](const Coordinates & id) - { - const int x = id.x(); - const int y = id.y(); - - const int z = id.z() % r; - const int out_x = x * _block_shape + (id.z() / r) % _block_shape; - const int out_y = y * _block_shape + (id.z() / r) / _block_shape; - Coordinates output_coords{ out_x, out_y, z, id[3] }; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } - while(window.slide_window_slice_2D(slice_in)); + execute_window_loop( + slice_in, + [&](const Coordinates &id) + { + const int x = id.x(); + const int y = id.y(); + + const int z = id.z() % r; + const int out_x = x * _block_shape + (id.z() / r) % _block_shape; + const int out_y = y * _block_shape + (id.z() / r) / _block_shape; + Coordinates output_coords{out_x, out_y, z, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_2D(slice_in)); } else { @@ -144,20 +149,21 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info do { Iterator in(_input, slice_in); - execute_window_loop(slice_in, [&](const Coordinates & id) - { - const int x = id.y(); - const int y = id.z(); - - const int z = id.x() % r; - const int out_x = x * _block_shape + (id.x() / r) % _block_shape; - const int out_y = y * _block_shape + (id.x() / r) / _block_shape; - Coordinates output_coords{ z, out_x, out_y, id[3] }; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } - while(window.slide_window_slice_3D(slice_in)); + execute_window_loop( + slice_in, + [&](const Coordinates &id) + { + const int x = id.y(); + const int y = id.z(); + + const int z = id.x() % r; + const int out_x = x * _block_shape + (id.x() / r) % _block_shape; + const int out_y = y * _block_shape + (id.x() / r) / _block_shape; + Coordinates output_coords{z, out_x, out_y, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_3D(slice_in)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp index 261437f07d..a5969cd497 100644 --- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp +++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -37,16 +38,19 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32); - 
ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -56,7 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, + ITensorInfo *output, + ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_UNUSED(idx, config); @@ -68,12 +75,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -NEFFTDigitReverseKernel::NEFFTDigitReverseKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr) +NEFFTDigitReverseKernel::NEFFTDigitReverseKernel() : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr) { } -void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *idx, const FFTDigitReverseKernelInfo &config) +void NEFFTDigitReverseKernel::configure(const ITensor *input, + ITensor *output, + const ITensor *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config)); @@ -91,11 +100,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); - if(axis == 0) + if (axis == 0) { - if(is_input_complex) + if (is_input_complex) { - if(is_conj) + if (is_conj) { _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<true, true>; } @@ -109,11 +118,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<false, false>; } } - else if(axis == 1) + else if (axis == 1) { - if(is_input_complex) + if (is_input_complex) { - if(is_conj) + if (is_conj) { _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1<true, true>; } @@ -133,10 +142,14 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c } } -Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); return Status{}; } @@ -159,38 +172,40 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0(const Window &window) std::vector<float> buffer_row_out(2 * N); 
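// Layout note (annotation): each row holds N complex values with interleaved real/imaginary parts, so element i occupies buffer[2 * i] (real) and buffer[2 * i + 1] (imaginary); both scratch rows are therefore sized 2 * N floats, even on the real-only input path where only N floats are loaded.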
std::vector<float> buffer_row_in(2 * N); - execute_window_loop(slice, [&](const Coordinates &) - { - if(is_input_complex) + execute_window_loop( + slice, + [&](const Coordinates &) { - // Load - memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), 2 * N * sizeof(float)); - - // Shuffle - for(size_t x = 0; x < 2 * N; x += 2) + if (is_input_complex) { - size_t idx = buffer_idx[x / 2]; - buffer_row_out[x] = buffer_row_in[2 * idx]; - buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]); - } - } - else - { - // Load - memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), N * sizeof(float)); + // Load + memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), 2 * N * sizeof(float)); - // Shuffle - for(size_t x = 0; x < N; ++x) + // Shuffle + for (size_t x = 0; x < 2 * N; x += 2) + { + size_t idx = buffer_idx[x / 2]; + buffer_row_out[x] = buffer_row_in[2 * idx]; + buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]); + } + } + else { - size_t idx = buffer_idx[x]; - buffer_row_out[2 * x] = buffer_row_in[idx]; + // Load + memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), N * sizeof(float)); + + // Shuffle + for (size_t x = 0; x < N; ++x) + { + size_t idx = buffer_idx[x]; + buffer_row_out[2 * x] = buffer_row_in[idx]; + } } - } - // Copy back - memcpy(reinterpret_cast<float *>(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float)); - }, - in, out); + // Copy back + memcpy(reinterpret_cast<float *>(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float)); + }, + in, out); } template <bool is_input_complex, bool is_conj> @@ -215,39 +230,41 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1(const Window &window) const size_t stride_z = _input->info()->strides_in_bytes()[2]; const size_t stride_w = _input->info()->strides_in_bytes()[3]; - execute_window_loop(slice, [&](const Coordinates & id) - { - auto *out_ptr = reinterpret_cast<float *>(out.ptr()); - auto *in_ptr = reinterpret_cast<float *>(_input->buffer() + id.z() * stride_z + id[3] * stride_w); - const size_t y_shuffled = buffer_idx[id.y()]; - - if(is_input_complex) + execute_window_loop( + slice, + [&](const Coordinates &id) { - // Shuffle the entire row into the output - memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float)); + auto *out_ptr = reinterpret_cast<float *>(out.ptr()); + auto *in_ptr = reinterpret_cast<float *>(_input->buffer() + id.z() * stride_z + id[3] * stride_w); + const size_t y_shuffled = buffer_idx[id.y()]; - // Conjugate if necessary - if(is_conj) + if (is_input_complex) { - for(size_t x = 0; x < 2 * Nx; x += 2) + // Shuffle the entire row into the output + memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float)); + + // Conjugate if necessary + if (is_conj) { - out_ptr[x + 1] = -out_ptr[x + 1]; + for (size_t x = 0; x < 2 * Nx; x += 2) + { + out_ptr[x + 1] = -out_ptr[x + 1]; + } } } - } - else - { - // Shuffle the entire row into the buffer - memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float)); - - // Copy the buffer to the output, with a zero imaginary part - for(size_t x = 0; x < 2 * Nx; x += 2) + else { - out_ptr[x] = buffer_row[x / 2]; + // Shuffle the entire row into the buffer + memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float)); + + // Copy the buffer to the output, with a zero imaginary part + for (size_t x = 0; x < 2 * Nx; x += 2) + { + out_ptr[x] = buffer_row[x / 2]; + } } - } - }, - out); + }, + out); } 
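// Background sketch (not part of this commit): the idx tensor consumed by both kernels above is a precomputed permutation. For a chain of radix-2 stages over length N it is the classic bit reversal, which could be generated as:
//     for (uint32_t i = 0; i < N; ++i)
//     {
//         uint32_t r = 0;
//         for (uint32_t b = 0; b < log2_N; ++b) // log2_N: assumed stage count, with N == 1u << log2_N
//         {
//             r |= ((i >> b) & 1u) << (log2_N - 1 - b);
//         }
//         idx[i] = r; // e.g. N = 8 yields {0, 4, 2, 6, 1, 5, 3, 7}
//     }
// The mixed-radix stages elsewhere in this patch (radix 3/5/7) generalise this to digit reversal in the corresponding radices.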
void NEFFTDigitReverseKernel::run(const Window &window, const ThreadInfo &info) diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h index f436c364b2..ecf85ebc98 100644 --- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h +++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -70,7 +71,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp index 44c841f626..4b58a7b9ac 100644 --- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp +++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp @@ -28,10 +28,11 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/traits.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/traits.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "support/ToolchainSupport.h" #include <arm_neon.h> @@ -70,7 +71,7 @@ float32x2_t c_mul_neon(float32x2_t a, float32x2_t b) { using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type; - const float32x2_t mask = { -1.0, 1.0 }; + const float32x2_t mask = {-1.0, 1.0}; const float32x2_t tmp0 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); const float32x2_t tmp1 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); @@ -88,7 +89,7 @@ float32x2_t c_mul_neon_img(float32x2_t a, float img_constant) const float a_r = wrapper::vgetlane(a, 0); const float a_i = wrapper::vgetlane(a, 1); - const auto out = wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant }); + const auto out = wrapper::vmul(float32x2_t{-a_i, a_r}, float32x2_t{img_constant, img_constant}); return out; } @@ -100,7 +101,8 @@ float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_ return wrapper::vadd(t2, e); } -float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7) +float32x2_t reduce_sum_7( + float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7) { const auto t0 = wrapper::vadd(x1, x2); const auto t1 = wrapper::vadd(x3, x4); @@ -111,7 +113,14 @@ float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32 return wrapper::vadd(t00, t01); } -float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8) +float32x2_t reduce_sum_8(float32x2_t x1, + float32x2_t x2, + float32x2_t x3, + float32x2_t x4, + float32x2_t x5, + float32x2_t x6, + float32x2_t x7, + float32x2_t x8) { const auto t0 = wrapper::vadd(x1, x2); const auto t1 = wrapper::vadd(x3, x4); @@ -141,15 +150,21 @@ void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t 
&z, const float32x2_t &w, x = wrapper::vadd(a, b); x = wrapper::vadd(x, c); - const auto v1 = wrapper::vmul(float32x2_t{ 0.5f, 0.5 }, wrapper::vadd(b, c)); - const auto v2 = c_mul_neon(float32x2_t{ 0.f, -kSqrt3Div2 }, wrapper::vsub(b, c)); + const auto v1 = wrapper::vmul(float32x2_t{0.5f, 0.5}, wrapper::vadd(b, c)); + const auto v2 = c_mul_neon(float32x2_t{0.f, -kSqrt3Div2}, wrapper::vsub(b, c)); y = z = wrapper::vsub(a, v1); y = wrapper::vadd(y, v2); z = wrapper::vsub(z, v2); } -void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3) +void fft_4(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + const float32x2_t &w, + const float32x2_t &w2, + const float32x2_t &w3) { float32x2_t a = x1; float32x2_t b = c_mul_neon(w, x2); @@ -173,7 +188,15 @@ void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, c x4 = wrapper::vadd(x41, x42); } -void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4) +void fft_5(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + float32x2_t &x5, + const float32x2_t &w, + const float32x2_t &w2, + const float32x2_t &w3, + const float32x2_t &w4) { const auto a = x1; const auto b = c_mul_neon(w, x2); @@ -181,25 +204,25 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f const auto d = c_mul_neon(w3, x4); const auto e = c_mul_neon(w4, x5); - const auto b0 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, b); - const auto b1 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, b); - const auto b2 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, b); - const auto b3 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, b); + const auto b0 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, b); + const auto b1 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, b); + const auto b2 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, b); + const auto b3 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, b); - const auto c0 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, c); - const auto c1 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, c); - const auto c2 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, c); - const auto c3 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, c); + const auto c0 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, c); + const auto c1 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, c); + const auto c2 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, c); + const auto c3 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, c); - const auto d0 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, d); - const auto d1 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, d); - const auto d2 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, d); - const auto d3 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, d); + const auto d0 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, d); + const auto d1 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, d); + const auto d2 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, d); + const auto d3 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, d); - const auto e0 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, e); - const auto e1 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, e); - const auto e2 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, e); - const auto e3 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, e); + const auto e0 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, e); + const auto e1 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, e); + const auto e2 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, e); + const auto e3 = 
c_mul_neon(float32x2_t{kW5_0, -kW5_1}, e); x1 = reduce_sum_5(a, b, c, d, e); x2 = reduce_sum_5(a, b0, c0, d0, e0); @@ -208,9 +231,19 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f x5 = reduce_sum_5(a, b3, c3, d3, e3); } -void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, +void fft_7(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + float32x2_t &x5, + float32x2_t &x6, + float32x2_t &x7, + const float32x2_t &w, + const float32x2_t &w2, + const float32x2_t &w3, const float32x2_t &w4, - const float32x2_t &w5, const float32x2_t &w6) + const float32x2_t &w5, + const float32x2_t &w6) { const auto a = x1; const auto b = c_mul_neon(w, x2); @@ -220,47 +253,47 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f const auto f = c_mul_neon(w5, x6); const auto g = c_mul_neon(w6, x7); - const auto b0 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, b); - const auto b1 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, b); - const auto b2 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, b); - const auto b3 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, b); - const auto b4 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, b); - const auto b5 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, b); - - const auto c0 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, c); - const auto c1 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, c); - const auto c2 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, c); - const auto c3 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, c); - const auto c4 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, c); - const auto c5 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, c); - - const auto d0 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, d); - const auto d1 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, d); - const auto d2 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, d); - const auto d3 = c_mul_neon(float32x2_t{ -kW7_2, +kW7_3 }, d); - const auto d4 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, d); - const auto d5 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, d); - - const auto e0 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, e); - const auto e1 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, e); - const auto e2 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, e); - const auto e3 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, e); - const auto e4 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, e); - const auto e5 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, e); - - const auto f0 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, f); - const auto f1 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, f); - const auto f2 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, f); - const auto f3 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, f); - const auto f4 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, f); - const auto f5 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, f); - - const auto g0 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, g); - const auto g1 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, g); - const auto g2 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, g); - const auto g3 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, g); - const auto g4 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, g); - const auto g5 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, g); + const auto b0 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, b); + const auto b1 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, b); + const auto b2 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, b); + const auto b3 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, b); + 
const auto b4 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, b); + const auto b5 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, b); + + const auto c0 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, c); + const auto c1 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, c); + const auto c2 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, c); + const auto c3 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, c); + const auto c4 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, c); + const auto c5 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, c); + + const auto d0 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, d); + const auto d1 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, d); + const auto d2 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, d); + const auto d3 = c_mul_neon(float32x2_t{-kW7_2, +kW7_3}, d); + const auto d4 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, d); + const auto d5 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, d); + + const auto e0 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, e); + const auto e1 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, e); + const auto e2 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, e); + const auto e3 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, e); + const auto e4 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, e); + const auto e5 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, e); + + const auto f0 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, f); + const auto f1 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, f); + const auto f2 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, f); + const auto f3 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, f); + const auto f4 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, f); + const auto f5 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, f); + + const auto g0 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, g); + const auto g1 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, g); + const auto g2 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, g); + const auto g3 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, g); + const auto g4 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, g); + const auto g5 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, g); x1 = reduce_sum_7(a, b, c, d, e, f, g); x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0); @@ -271,9 +304,20 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5); } -void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8, const float32x2_t &w, const float32x2_t &w2, +void fft_8(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + float32x2_t &x5, + float32x2_t &x6, + float32x2_t &x7, + float32x2_t &x8, + const float32x2_t &w, + const float32x2_t &w2, const float32x2_t &w3, - const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6, + const float32x2_t &w4, + const float32x2_t &w5, + const float32x2_t &w6, const float32x2_t &w7) { const auto a = x1; @@ -285,61 +329,61 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f const auto g = c_mul_neon(w6, x7); const auto h = c_mul_neon(w7, x8); - const auto b0 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, b); - const auto b1 = c_mul_neon(float32x2_t{ 0, -1 }, b); - const auto b2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, b); - const auto b3 = c_mul_neon(float32x2_t{ -1, 0 }, b); - const auto b4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, b); - const auto b5 = c_mul_neon(float32x2_t{ 0, 1 }, b); - const auto b6 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, b); - - const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c); - const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, 
c); - const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c); - const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c); - const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c); - const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c); - const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c); - - const auto d0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, d); - const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d); - const auto d2 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, d); - const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d); - const auto d4 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, d); - const auto d5 = c_mul_neon(float32x2_t{ 0, -1 }, d); - const auto d6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, d); - - const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e); - const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e); - const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e); - const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e); - const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e); - const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e); - const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e); - - const auto f0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, f); - const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f); - const auto f2 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, f); - const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f); - const auto f4 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, f); - const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f); - const auto f6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, f); - - const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g); - const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g); - const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g); - const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g); - const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g); - const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g); - const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g); - - const auto h0 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, h); - const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h); - const auto h2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, h); - const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h); - const auto h4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, h); - const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h); - const auto h6 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, h); + const auto b0 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, b); + const auto b1 = c_mul_neon(float32x2_t{0, -1}, b); + const auto b2 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, b); + const auto b3 = c_mul_neon(float32x2_t{-1, 0}, b); + const auto b4 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, b); + const auto b5 = c_mul_neon(float32x2_t{0, 1}, b); + const auto b6 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, b); + + const auto c0 = c_mul_neon(float32x2_t{0, -1}, c); + const auto c1 = c_mul_neon(float32x2_t{-1, 0}, c); + const auto c2 = c_mul_neon(float32x2_t{0, 1}, c); + const auto c3 = c_mul_neon(float32x2_t{1, 0}, c); + const auto c4 = c_mul_neon(float32x2_t{0, -1}, c); + const auto c5 = c_mul_neon(float32x2_t{-1, 0}, c); + const auto c6 = c_mul_neon(float32x2_t{0, 1}, c); + + const auto d0 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, d); + const auto d1 = c_mul_neon(float32x2_t{0, 1}, d); + const auto d2 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, d); + const auto d3 = c_mul_neon(float32x2_t{-1, 0}, d); + const auto d4 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, d); + 
const auto d5 = c_mul_neon(float32x2_t{0, -1}, d); + const auto d6 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, d); + + const auto e0 = c_mul_neon(float32x2_t{-1, 0}, e); + const auto e1 = c_mul_neon(float32x2_t{1, 0}, e); + const auto e2 = c_mul_neon(float32x2_t{-1, 0}, e); + const auto e3 = c_mul_neon(float32x2_t{1, 0}, e); + const auto e4 = c_mul_neon(float32x2_t{-1, 0}, e); + const auto e5 = c_mul_neon(float32x2_t{1, 0}, e); + const auto e6 = c_mul_neon(float32x2_t{-1, 0}, e); + + const auto f0 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, f); + const auto f1 = c_mul_neon(float32x2_t{0, -1}, f); + const auto f2 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, f); + const auto f3 = c_mul_neon(float32x2_t{-1, 0}, f); + const auto f4 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, f); + const auto f5 = c_mul_neon(float32x2_t{0, 1}, f); + const auto f6 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, f); + + const auto g0 = c_mul_neon(float32x2_t{0, 1}, g); + const auto g1 = c_mul_neon(float32x2_t{-1, 0}, g); + const auto g2 = c_mul_neon(float32x2_t{0, -1}, g); + const auto g3 = c_mul_neon(float32x2_t{1, 0}, g); + const auto g4 = c_mul_neon(float32x2_t{0, 1}, g); + const auto g5 = c_mul_neon(float32x2_t{-1, 0}, g); + const auto g6 = c_mul_neon(float32x2_t{0, -1}, g); + + const auto h0 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, h); + const auto h1 = c_mul_neon(float32x2_t{0, 1}, h); + const auto h2 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, h); + const auto h3 = c_mul_neon(float32x2_t{-1, 0}, h); + const auto h4 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, h); + const auto h5 = c_mul_neon(float32x2_t{0, -1}, h); + const auto h6 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, h); x1 = reduce_sum_8(a, b, c, d, e, f, g, h); x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0); @@ -352,18 +396,19 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f } template <bool first_stage> -void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_2_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - auto a = float32x2_t{ 0, 0 }; - auto b = float32x2_t{ 0, 0 }; + auto a = float32x2_t{0, 0}; + auto b = float32x2_t{0, 0}; // Load inputs - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); a = wrapper::vgetlow(ab); @@ -379,7 +424,7 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR fft_2(a, b, w); // Write outputs - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); } @@ -394,12 +439,20 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_2_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + 
float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -418,20 +471,21 @@ void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template <bool first_stage> -void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_3_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { // Load inputs - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - if(first_stage) + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + if (first_stage) { const auto ab = wrapper::vloadq(in + k); a = wrapper::vgetlow(ab); @@ -447,7 +501,7 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR // Base-case prime transform fft_3(a, b, c, w, w2); - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); } @@ -462,14 +516,22 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_3_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -489,21 +551,22 @@ void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template <bool first_stage> -void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_4_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); const auto w3 = c_mul_neon(w2, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - if(first_stage) + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -524,7 +587,7 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR // Base-case prime transform 
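// Butterfly sketch (annotation; signs assume the forward-DFT convention, with j the imaginary unit): given the twiddled inputs a = x1, b = w * x2, c = w2 * x3, d = w3 * x4, fft_4 computes the length-4 DFT
//     x1 = a + b + c + d
//     x2 = a - j*b - c + j*d
//     x3 = a - b + c - d
//     x4 = a + j*b - c - j*d
// with every complex product carried out by c_mul_neon.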
fft_4(a, b, c, d, w, w2, w3); - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -542,15 +605,23 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_4_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); const auto w3 = c_mul_neon(w2, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -572,25 +643,26 @@ void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template <bool first_stage> -void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_5_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); const float32x2_t w4 = c_mul_neon(w3, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - float32x2_t e = { 0, 0 }; + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + float32x2_t e = {0, 0}; // Load inputs - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -613,7 +685,7 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR fft_5(a, b, c, d, e, w, w2, w3, w4); // Store outputs - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -632,16 +704,24 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_5_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); const float32x2_t w4 = c_mul_neon(w3, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ 
-666,10 +746,11 @@ void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template <bool first_stage> -void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_7_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -677,18 +758,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w5 = c_mul_neon(w4, w); const float32x2_t w6 = c_mul_neon(w5, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - float32x2_t e = { 0, 0 }; - float32x2_t f = { 0, 0 }; - float32x2_t g = { 0, 0 }; + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + float32x2_t e = {0, 0}; + float32x2_t f = {0, 0}; + float32x2_t g = {0, 0}; // Load inputs - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -715,7 +796,7 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR // Base-case prime transform fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6); - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -737,10 +818,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_7_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -748,7 +837,7 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w5 = c_mul_neon(w4, w); const float32x2_t w6 = c_mul_neon(w5, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -777,10 +866,11 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template <bool first_stage> -void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_8_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -789,20 +879,20 @@ void fft_radix_8_axes_0(float 
*out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w6 = c_mul_neon(w5, w); const float32x2_t w7 = c_mul_neon(w6, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { // Load inputs - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - float32x2_t e = { 0, 0 }; - float32x2_t f = { 0, 0 }; - float32x2_t g = { 0, 0 }; - float32x2_t h = { 0, 0 }; + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + float32x2_t e = {0, 0}; + float32x2_t f = {0, 0}; + float32x2_t g = {0, 0}; + float32x2_t h = {0, 0}; // Base-case prime transform - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -834,7 +924,7 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7); // Store outputs - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -858,10 +948,18 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_8_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -870,7 +968,7 @@ void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w6 = c_mul_neon(w5, w); const float32x2_t w7 = c_mul_neon(w6, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -908,7 +1006,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_UNUSED(config); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -917,11 +1015,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) { ARM_COMPUTE_UNUSED(config); - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output, *input); } @@ -942,7 +1041,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis0(const FFTRadixStageKernelInfo // FFT table axis 0: [radix, first_stage] static std::map<unsigned int, std::map<bool, FFTFunctionPointerAxis0>> fft_table_axis0; - if(fft_table_axis0.empty()) + if (fft_table_axis0.empty()) { 
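All of the radix kernels above share one index scheme: complex element m lives at floats 2m and 2m+1, each butterfly reads its radix inputs 2*Nx floats apart, and k advances by 2*NxRadix; the first_stage specialisations additionally batch two complex values per vector load (vloadq) and recombine them on store (vcombine). A scalar radix-2 analogue of one stage, assuming digit-reversed input, only to make the stride pattern concrete; it is a simplified sketch, not the NEON path:

#include <cmath>
#include <cstdio>
#include <vector>

constexpr float kPi = 3.14159265358979f;

// One scalar radix-2 DIT stage over interleaved {re, im} data of N complex
// points. Nx is the span of the sub-transforms computed so far; the kernels
// above use the same k stepping with radix R and input offsets 2*Nx*r, r < R.
static void radix2_stage(std::vector<float> &x, unsigned N, unsigned Nx)
{
    const unsigned NxRadix = 2 * Nx;
    const float alpha = 2.0f * kPi / float(NxRadix);
    for (unsigned j = 0; j < Nx; ++j)
    {
        // w = e^(-i*alpha*j); the kernels keep this incremental via w_m instead.
        const float wr = std::cos(alpha * float(j));
        const float wi = -std::sin(alpha * float(j));
        for (unsigned k = 2 * j; k < 2 * N; k += 2 * NxRadix)
        {
            const float ar = x[k], ai = x[k + 1];
            const float br = x[k + 2 * Nx], bi = x[k + 2 * Nx + 1];
            const float tr = wr * br - wi * bi; // t = w * b
            const float ti = wr * bi + wi * br;
            x[k] = ar + tr; // a' = a + w*b
            x[k + 1] = ai + ti;
            x[k + 2 * Nx] = ar - tr; // b' = a - w*b
            x[k + 2 * Nx + 1] = ai - ti;
        }
    }
}

int main()
{
    // 4-point FFT of {1, 2, 3, 4}, supplied in bit-reversed order {1, 3, 2, 4}.
    std::vector<float> x{1, 0, 3, 0, 2, 0, 4, 0};
    radix2_stage(x, 4, 1);
    radix2_stage(x, 4, 2);
    for (unsigned m = 0; m < 4; ++m)
        std::printf("X[%u] = (%g, %g)\n", m, x[2 * m], x[2 * m + 1]);
    return 0; // expected: (10,0) (-2,2) (-2,0) (-2,-2)
}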
fft_table_axis0[2][false] = &fft_radix_2_axes_0<false>; fft_table_axis0[3][false] = &fft_radix_3_axes_0<false>; @@ -967,7 +1066,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis1(const FFTRadixStageKernelInfo // FFT table axis 1: [radix, first_stage] static std::map<unsigned int, FFTFunctionPointerAxis1> fft_table_axis1; - if(fft_table_axis1.empty()) + if (fft_table_axis1.empty()) { fft_table_axis1[2] = &fft_radix_2_axes_1; fft_table_axis1[3] = &fft_radix_3_axes_1; @@ -985,12 +1084,13 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT ARM_COMPUTE_ERROR_ON_NULLPTR(input); // Output auto inizialitation if not yet initialized - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output->info(), *input->info()->clone()); } - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config)); _input = input; _output = (output == nullptr) ? input : output; @@ -998,7 +1098,7 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT _axis = config.axis; _radix = config.radix; - switch(config.axis) + switch (config.axis) { case 0: set_radix_stage_axis0(config); @@ -1012,26 +1112,28 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT } // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config); + auto win_config = + validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); } -Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config) +Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const FFTRadixStageKernelInfo &config) { const bool run_in_place = (output == nullptr) || (output == input); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (run_in_place) ? nullptr : output->clone().get(), - config) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), (run_in_place) ? 
nullptr : output->clone().get(), config) + .first); return Status{}; } std::set<unsigned int> NEFFTRadixStageKernel::supported_radix() { - return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 }; + return std::set<unsigned int>{2, 3, 4, 5, 7, 8}; } void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info) @@ -1049,28 +1151,32 @@ void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info) // Precompute FFT constants const unsigned int NxRadix = _radix * _Nx; const float alpha = 2.0f * kPi / float(NxRadix); - const float32x2_t w_m{ cosf(alpha), -sinf(alpha) }; + const float32x2_t w_m{cosf(alpha), -sinf(alpha)}; - if(_axis == 0) + if (_axis == 0) { const unsigned int N = _input->info()->dimension(0); - execute_window_loop(input_window, [&](const Coordinates &) - { - _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N); - }, - in, out); + execute_window_loop( + input_window, + [&](const Coordinates &) { + _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, + N); + }, + in, out); } else { const unsigned int N = _input->info()->dimension(0); const unsigned int M = _input->info()->dimension(1); - execute_window_loop(input_window, [&](const Coordinates &) - { - _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N, M, - _input->info()->padding().right + _input->info()->padding().left, - _output->info()->padding().right + _output->info()->padding().left); - }, - in, out); + execute_window_loop( + input_window, + [&](const Coordinates &) + { + _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N, + M, _input->info()->padding().right + _input->info()->padding().left, + _output->info()->padding().right + _output->info()->padding().left); + }, + in, out); } ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.h b/src/core/NEON/kernels/NEFFTRadixStageKernel.h index 2291a1068c..54f32efa23 100644 --- a/src/core/NEON/kernels/NEFFTRadixStageKernel.h +++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/NEON/INEKernel.h" #include <arm_neon.h> @@ -92,8 +93,17 @@ private: void set_radix_stage_axis0(const FFTRadixStageKernelInfo &config); void set_radix_stage_axis1(const FFTRadixStageKernelInfo &config); - using FFTFunctionPointerAxis0 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int)>; - using FFTFunctionPointerAxis1 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int, unsigned int, unsigned int, unsigned int)>; + using FFTFunctionPointerAxis0 = + std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int)>; + using FFTFunctionPointerAxis1 = std::function<void(float *, + float *, + unsigned int, + unsigned int, + const float32x2_t &, + unsigned int, + unsigned int, + unsigned int, + unsigned int)>; FFTFunctionPointerAxis0 _func_0; FFTFunctionPointerAxis1 _func_1; diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp index 5ec330bebc..9fe561fc59 100644 --- a/src/core/NEON/kernels/NEFFTScaleKernel.cpp +++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp @@ -28,9 +28,10 @@ #include "arm_compute/core/Types.h" #include 
"arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include <arm_neon.h> @@ -41,8 +42,8 @@ namespace void scale_complex(float *c_in, float *c_out, bool is_conjugate, float scale) { const auto a = wrapper::vload(c_in); - auto b = wrapper::vdiv(a, float32x2_t{ scale, scale }); - if(is_conjugate) + auto b = wrapper::vdiv(a, float32x2_t{scale, scale}); + if (is_conjugate) { const float img_part = wrapper::vgetlane(b, 1); b = wrapper::vsetlane(-img_part, b, 1); @@ -56,7 +57,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -71,7 +72,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen // Configure kernel window Window win = calculate_max_window(*input, Steps()); - if(output != nullptr) + if (output != nullptr) { // Output auto inizialitation if not yet initialized auto_init_if_empty(*output, *input->clone()); @@ -126,10 +127,10 @@ void NEFFTScaleKernel::run(const Window &window, const ThreadInfo &info) Iterator in(_input, input_window); Iterator out(_run_in_place ? _input : _output, input_window); - execute_window_loop(window, [&](const Coordinates &) - { - scale_complex(reinterpret_cast<float *>(in.ptr()), reinterpret_cast<float *>(out.ptr()), _is_conj, _scale); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &) + { scale_complex(reinterpret_cast<float *>(in.ptr()), reinterpret_cast<float *>(out.ptr()), _is_conj, _scale); }, + in, out); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.h b/src/core/NEON/kernels/NEFFTScaleKernel.h index 24a19f98ba..608cf5ea34 100644 --- a/src/core/NEON/kernels/NEFFTScaleKernel.h +++ b/src/core/NEON/kernels/NEFFTScaleKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_NEFFTSCALEKERNEL_H #define ARM_COMPUTE_NEFFTSCALEKERNEL_H -#include "src/core/NEON/INEKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/NEON/INEKernel.h" + namespace arm_compute { // Forward declarations diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp index 1c7c1f9763..00b0c0ae8d 100644 --- a/src/core/NEON/kernels/NEFillBorderKernel.cpp +++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp @@ -30,14 +30,19 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" namespace arm_compute { namespace { -inline void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value) +inline void fill_constant_value_single_channel_special(ITensor *tensor, + const Window &window, + unsigned int right, + unsigned int bottom, + const PixelValue &constant_border_value) { float border_value; 
constant_border_value.get(border_value); @@ -52,39 +57,43 @@ inline void fill_constant_value_single_channel_special(ITensor *tensor, const Wi Iterator vertical_it(tensor, vertical); - execute_window_loop(vertical, [&](const Coordinates &) - { - const auto row_start = reinterpret_cast<float *>(start_valid_region + vertical_it.offset()); + execute_window_loop( + vertical, + [&](const Coordinates &) + { + const auto row_start = reinterpret_cast<float *>(start_valid_region + vertical_it.offset()); - // Fill left and right borders - *(row_start - 1) = border_value; - std::fill_n(row_start + width, right, border_value); - }, - vertical_it); + // Fill left and right borders + *(row_start - 1) = border_value; + std::fill_n(row_start + width, right, border_value); + }, + vertical_it); // Top and bottom border Iterator plane_it(tensor, window); // Iterate over all XY planes - execute_window_loop(window, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + plane_it.offset(); - // Top border - const auto row_start = reinterpret_cast<float *>(base_addr - stridey); - // Fill top rows including left/right borders - std::fill_n(row_start - 1, 1 + width + right, border_value); - - // Bottom border - const unsigned low_border_size = height + bottom; - for(unsigned int i = height; i < low_border_size; ++i) + execute_window_loop( + window, + [&](const Coordinates &) { - const auto row_start = reinterpret_cast<float *>(base_addr + i * stridey); - - // Fill bottom rows including left/right borders + uint8_t *base_addr = start_valid_region + plane_it.offset(); + // Top border + const auto row_start = reinterpret_cast<float *>(base_addr - stridey); + // Fill top rows including left/right borders std::fill_n(row_start - 1, 1 + width + right, border_value); - } - }, - plane_it); + + // Bottom border + const unsigned low_border_size = height + bottom; + for (unsigned int i = height; i < low_border_size; ++i) + { + const auto row_start = reinterpret_cast<float *>(base_addr + i * stridey); + + // Fill bottom rows including left/right borders + std::fill_n(row_start - 1, 1 + width + right, border_value); + } + }, + plane_it); } } // namespace @@ -93,14 +102,20 @@ NEFillBorderKernel::NEFillBorderKernel() { } -void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void NEFillBorderKernel::configure(ITensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); _tensor = tensor; configure(tensor->info(), border_size, border_mode, constant_border_value); } -void NEFillBorderKernel::configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void NEFillBorderKernel::configure(ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. 
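The special case above hard-codes a one-element left/top F32 border: every valid row writes one element to its left and `right` elements past its end, then whole padded rows are filled above and below the valid region (the kernel drives this through execute_window_loop, first over a vertical window, then over the XY planes). A plain sketch of the same fill geometry on a row-major buffer; here `stridey` is in elements rather than bytes, and the buffer layout is an assumption of the sketch:

#include <algorithm>
#include <cstdio>
#include <vector>

// Fill a 1-element left/top border plus `right`/`bottom` padding of a
// width x height valid region stored row-major with row stride `stridey`.
static void fill_constant_border(float *valid_start, unsigned width, unsigned height,
                                 unsigned right, unsigned bottom, unsigned stridey, float value)
{
    // Left and right borders of every valid row.
    for (unsigned y = 0; y < height; ++y)
    {
        float *row = valid_start + y * stridey;
        row[-1] = value; // single left element
        std::fill_n(row + width, right, value);
    }
    // One top row, then `bottom` rows below, each including left/right borders.
    float *top = valid_start - stridey;
    std::fill_n(top - 1, 1 + width + right, value);
    for (unsigned i = height; i < height + bottom; ++i)
        std::fill_n(valid_start + i * stridey - 1, 1 + width + right, value);
}

int main()
{
    const unsigned width = 3, height = 2, right = 2, bottom = 1, stridey = 6;
    // Buffer with 1 top row, `bottom` bottom row(s), 1 left and `right` right columns.
    std::vector<float> buf((1 + height + bottom) * stridey, 42.0f);
    float *valid = buf.data() + stridey + 1; // skip the top row and left column
    fill_constant_border(valid, width, height, right, bottom, stridey, 0.0f);
    for (unsigned i = 0; i < buf.size(); ++i)
        std::printf("%g%c", buf[i], (i % stridey == stridey - 1) ? '\n' : ' ');
    return 0;
}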
@@ -124,7 +139,7 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_UNUSED(info); // If there is no border: early exit - if(_border_size.empty()) + if (_border_size.empty()) { return; } @@ -132,13 +147,14 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_mode) + switch (_mode) { case BorderMode::CONSTANT: { - if(_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32) + if (_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32) { - fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value); + fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, + _constant_border_value); } else { @@ -176,46 +192,56 @@ void NEFillBorderKernel::fill_replicate_single_channel(const Window &window) Iterator vertical_it(_tensor, vertical); - execute_window_loop(vertical, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + vertical_it.offset(); - // Fill left and right borders - for(unsigned int i = 0; i < _border_size.left; ++i) + execute_window_loop( + vertical, + [&](const Coordinates &) { - std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, vertical_it.ptr(), element_size); - } + uint8_t *base_addr = start_valid_region + vertical_it.offset(); + // Fill left and right borders + for (unsigned int i = 0; i < _border_size.left; ++i) + { + std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, vertical_it.ptr(), + element_size); + } - for(unsigned int i = 0; i < _border_size.right; ++i) - { - std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size, element_size); - } - }, - vertical_it); + for (unsigned int i = 0; i < _border_size.right; ++i) + { + std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size, + element_size); + } + }, + vertical_it); // Top and bottom border Iterator plane_it(_tensor, window); // Iterate over all XY planes - execute_window_loop(window, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + plane_it.offset(); - // Top border - for(int i = -_border_size.top; i < 0; ++i) + execute_window_loop( + window, + [&](const Coordinates &) { - // Copy top rows including left/right borders - std::memcpy(base_addr + i * static_cast<int>(_tensor->info()->strides_in_bytes()[1]) - _border_size.left * element_size, - base_addr - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size); - } + uint8_t *base_addr = start_valid_region + plane_it.offset(); + // Top border + for (int i = -_border_size.top; i < 0; ++i) + { + // Copy top rows including left/right borders + std::memcpy(base_addr + i * static_cast<int>(_tensor->info()->strides_in_bytes()[1]) - + _border_size.left * element_size, + base_addr - _border_size.left * element_size, + (_border_size.left + width + _border_size.right) * element_size); + } - // Bottom border - for(unsigned int i = height; i < height + _border_size.bottom; ++i) - { - // Copy bottom rows including left/right borders - std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, - base_addr + (height - 1) * 
_tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size); - } - }, - plane_it); + // Bottom border + for (unsigned int i = height; i < height + _border_size.bottom; ++i) + { + // Copy bottom rows including left/right borders + std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, + base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] - + _border_size.left * element_size, + (_border_size.left + width + _border_size.right) * element_size); + } + }, + plane_it); } void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window) @@ -232,50 +258,57 @@ void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window Iterator vertical_it(_tensor, vertical); - execute_window_loop(vertical, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + vertical_it.offset(); - // Fill left and right borders - for(unsigned int i = 0; i < _border_size.left; ++i) + execute_window_loop( + vertical, + [&](const Coordinates &) { - std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, &_constant_border_value, element_size); - } + uint8_t *base_addr = start_valid_region + vertical_it.offset(); + // Fill left and right borders + for (unsigned int i = 0; i < _border_size.left; ++i) + { + std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, &_constant_border_value, + element_size); + } - for(unsigned int i = 0; i < _border_size.right; ++i) - { - std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size); - } - }, - vertical_it); + for (unsigned int i = 0; i < _border_size.right; ++i) + { + std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size); + } + }, + vertical_it); // Top and bottom border Iterator plane_it(_tensor, window); // Iterate over all XY planes - execute_window_loop(window, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + plane_it.offset(); - // Top border - for(int i = -_border_size.top; i < 0; ++i) + execute_window_loop( + window, + [&](const Coordinates &) { - // Fill top rows including left/right borders - for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j) + uint8_t *base_addr = start_valid_region + plane_it.offset(); + // Top border + for (int i = -_border_size.top; i < 0; ++i) { - std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size); + // Fill top rows including left/right borders + for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j) + { + std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, + &_constant_border_value, element_size); + } } - } - // Bottom border - const unsigned low_border_size = height + _border_size.bottom; - for(unsigned int i = height; i < low_border_size; ++i) - { - // Fill bottom rows including left/right borders - for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j) + // Bottom border + const unsigned low_border_size = height + _border_size.bottom; + for (unsigned int i = height; i < low_border_size; ++i) { - std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size); + // Fill bottom rows including left/right borders + for (unsigned int j = 0; j < 
(_border_size.left + width + _border_size.right); ++j) + { + std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, + &_constant_border_value, element_size); + } } - } - }, - plane_it); + }, + plane_it); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFillBorderKernel.h b/src/core/NEON/kernels/NEFillBorderKernel.h index 2c851583ed..aaad108bfa 100644 --- a/src/core/NEON/kernels/NEFillBorderKernel.h +++ b/src/core/NEON/kernels/NEFillBorderKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -64,7 +65,10 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. * */ - void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(ITensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Initialise the function. * * @note This kernel fills the borders within the XY-planes. @@ -75,7 +79,10 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. * */ - void configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp index 51a69046a9..cbe5136fb1 100644 --- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp @@ -22,7 +22,6 @@ * SOFTWARE. 
*/ #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" -#include "src/cpu/kernels/fuse_batch_normalization/list.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -30,12 +29,14 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/common/cpuinfo/CpuIsaInfo.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/fuse_batch_normalization/list.h" #include <map> @@ -52,8 +53,16 @@ struct FuseBatchNormalizeSelectorData }; using FBNSelectorPtr = std::add_pointer<bool(const FuseBatchNormalizeSelectorData &data)>::type; -using FBNUKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, ITensor *, - const ITensor *, const ITensor *, const ITensor *, const ITensor *, float, const Window &)>::type; +using FBNUKernelPtr = std::add_pointer<void(const ITensor *, + const ITensor *, + ITensor *, + ITensor *, + const ITensor *, + const ITensor *, + const ITensor *, + const ITensor *, + float, + const Window &)>::type; struct FBNUKernel { @@ -62,73 +71,63 @@ struct FBNUKernel FBNUKernelPtr ukernel; }; -static const FBNUKernel available_kernels[] = -{ - { - "fused_batch_normalization_conv_NHWC_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16) - }, - { - "fused_batch_normalization_conv_NCHW_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16) - }, - { - "fused_batch_normalization_dwc_NHWC_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16) - }, - { - "fused_batch_normalization_dwc_NCHW_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16) - }, - { - "fused_batch_normalization_conv_NHWC_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32) - }, - { - "fused_batch_normalization_conv_NCHW_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32) - }, - { - "fused_batch_normalization_dwc_NHWC_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt 
== DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32) - }, - { - "fused_batch_normalization_dwc_NCHW_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32) - } -}; +static const FBNUKernel available_kernels[] = { + {"fused_batch_normalization_conv_NHWC_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)}, + {"fused_batch_normalization_conv_NCHW_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)}, + {"fused_batch_normalization_dwc_NHWC_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16)}, + {"fused_batch_normalization_dwc_NCHW_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16)}, + {"fused_batch_normalization_conv_NHWC_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)}, + {"fused_batch_normalization_conv_NCHW_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)}, + {"fused_batch_normalization_dwc_NHWC_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32)}, + {"fused_batch_normalization_dwc_NCHW_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32)}}; /** Micro-kernel selector * @@ -140,9 +139,9 @@ static const FBNUKernel available_kernels[] = */ const FBNUKernel *get_implementation(const FuseBatchNormalizeSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -150,10 +149,16 @@ const FBNUKernel 
*get_implementation(const FuseBatchNormalizeSelectorData &data) return nullptr; } -Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status validate_arguments(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { ARM_COMPUTE_UNUSED(epsilon); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); @@ -164,43 +169,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr); ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1); - if(fbn_type == FuseBatchNormalizationType::CONVOLUTION) + if (fbn_type == FuseBatchNormalizationType::CONVOLUTION) { ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0)); } else { - const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); + const size_t channel_idx = + get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0)); } // Validate bias - if(input_bias != nullptr) + if (input_bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias); } // Validate beta - if(bn_beta != nullptr) + if (bn_beta != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta); } // Validate gamma - if(bn_gamma != nullptr) + if (bn_gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma); } // Validate output weights - if(fused_weights != nullptr && fused_weights->total_size() != 0) + if (fused_weights != nullptr && fused_weights->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights); } // Validate output bias - if(fused_bias != nullptr && fused_bias->total_size() != 0) + if (fused_bias != nullptr && fused_bias->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias); @@ -212,15 +218,31 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b } // namespace NEFuseBatchNormalizationKernel::NEFuseBatchNormalizationKernel() - : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(), - _run_in_place_weights(false), _run_in_place_bias(false), _func(nullptr) + : _input_weights(nullptr), + _input_bias(nullptr), + _bn_mean(nullptr), + _bn_var(nullptr), + 
_bn_gamma(nullptr), + _bn_beta(nullptr), + _fused_weights(nullptr), + _fused_bias(nullptr), + _epsilon(), + _run_in_place_weights(false), + _run_in_place_bias(false), + _func(nullptr) { } -void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, - ITensor *fused_weights, ITensor *fused_bias, - const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); @@ -238,27 +260,27 @@ void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, con _run_in_place_bias = (fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias); // Auto initialize outputs - if(_fused_weights != nullptr) + if (_fused_weights != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone()); } - if(_fused_bias != nullptr) + if (_fused_bias != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone()); } // Validate arguments - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(), - (fused_weights != nullptr) ? fused_weights->info() : nullptr, - (fused_bias != nullptr) ? fused_bias->info() : nullptr, - (input_bias != nullptr) ? input_bias->info() : nullptr, - (bn_beta != nullptr) ? bn_beta->info() : nullptr, - (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, - epsilon, fbn_type)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input_weights->info(), bn_mean->info(), bn_var->info(), + (fused_weights != nullptr) ? fused_weights->info() : nullptr, + (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr, + (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? 
bn_gamma->info() : nullptr, epsilon, + fbn_type)); - const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{ input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa() }); + const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{ + input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); _func = uk->ukernel; @@ -268,12 +290,19 @@ void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, con INEKernel::configure(win); } -Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); return Status{}; } @@ -284,6 +313,7 @@ void NEFuseBatchNormalizationKernel::run(const Window &window, const ThreadInfo ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon, window); + (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon, + window); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h index ee767b01c8..f23280d55a 100644 --- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h +++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h @@ -66,9 +66,16 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION. 
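The available_kernels/get_implementation pairing above is a data-driven micro-kernel dispatch: each entry carries a name, a predicate over (data type, data layout, ISA features, fuse type), and a function pointer, and the first matching entry wins, so more specific entries must be listed before more generic ones. A stripped-down sketch of the idiom with hypothetical types; the real table also wraps entries in REGISTER_* macros so that paths excluded from the build compile out:

#include <cstdio>

enum class DT { F16, F32 };
enum class DL { NCHW, NHWC };

struct SelectorData { DT dt; DL dl; bool fp16_isa; };

struct Kernel
{
    const char *name;
    bool (*is_selected)(const SelectorData &);
    void (*ukernel)();
};

static void run_f16_nhwc() { std::puts("f16 nhwc kernel"); }
static void run_f32_any()  { std::puts("f32 kernel"); }

// First match wins: order entries from most to least specific.
static const Kernel available_kernels[] = {
    {"fused_f16_NHWC",
     [](const SelectorData &d) { return d.dt == DT::F16 && d.dl == DL::NHWC && d.fp16_isa; },
     run_f16_nhwc},
    {"fused_f32",
     [](const SelectorData &d) { return d.dt == DT::F32; },
     run_f32_any},
};

static const Kernel *get_implementation(const SelectorData &d)
{
    for (const auto &k : available_kernels)
        if (k.is_selected(d))
            return &k;
    return nullptr; // caller must handle "no kernel for this configuration"
}

int main()
{
    const Kernel *k = get_implementation({DT::F32, DL::NCHW, false});
    if (k != nullptr)
    {
        std::printf("selected %s\n", k->name);
        k->ukernel();
    }
    return 0;
}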
*/ - void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias = nullptr, + const ITensor *bn_beta = nullptr, + const ITensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalizationKernel * * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC @@ -86,10 +93,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + static Status validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias = nullptr, + const ITensorInfo *bn_beta = nullptr, + const ITensorInfo *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -107,8 +120,16 @@ private: bool _run_in_place_weights; bool _run_in_place_bias; - using FuseBatchNormFunction = void(const ITensor *input_weights, const ITensor *input_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window); + using FuseBatchNormFunction = void(const ITensor *input_weights, + const ITensor *input_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window); FuseBatchNormFunction *_func; }; diff --git a/src/core/NEON/kernels/NEGatherKernel.cpp b/src/core/NEON/kernels/NEGatherKernel.cpp index 11332ffac8..f1d457d399 100644 --- a/src/core/NEON/kernels/NEGatherKernel.cpp +++ b/src/core/NEON/kernels/NEGatherKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,20 +43,22 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - if(axis < 0) + if 
(axis < 0) { axis += input->num_dimensions(); } ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions())); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > Coordinates::num_max_dimensions); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > + Coordinates::num_max_dimensions); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), axis); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->tensor_shape(), indices->tensor_shape(), axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } @@ -81,23 +84,23 @@ void NEGatherKernel::gather_common(const Window &window, const ThreadInfo &info) const auto idx_info = _indices->info(); const auto dst_info = _output->info(); - const auto num_dims = dst_info->num_dimensions(); + const auto num_dims = dst_info->num_dimensions(); const auto chunk_stride = src_info->strides_in_bytes()[_axis]; const auto window_start_x = window.x().start(); - const auto window_end_x = window.x().end(); - auto window_size_x = src_info->element_size(); + const auto window_end_x = window.x().end(); + auto window_size_x = src_info->element_size(); const auto idx_limit = static_cast<TIndex>(src_info->tensor_shape()[_axis]); - if(_axis != 0) + if (_axis != 0) { dst_win.set(0, Window::Dimension(window_start_x, window_start_x + 1, 1)); window_size_x *= window_end_x - window_start_x; } // Compute source and index tensors window based on the output window. - auto src_win = dst_win; + auto src_win = dst_win; Window idx_win; for (size_t i = 0; i < idx_info->num_dimensions(); ++i) @@ -109,22 +112,27 @@ void NEGatherKernel::gather_common(const Window &window, const ThreadInfo &info) // Use the custom strides to access all three tensors using the same loop. 
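validate_arguments above normalises a negative axis by adding the input rank, then requires the output to have input rank plus indices rank minus one dimensions. A sketch of the corresponding shape rule, where this gather_shape is an illustrative stand-in for the library's compute_gather_shape:

#include <cassert>
#include <cstdio>
#include <vector>

// Gather output shape: the input shape with dimension `axis` replaced by all
// dimensions of the indices shape (simplified sketch of the rule checked above).
static std::vector<size_t> gather_shape(const std::vector<size_t> &in,
                                        const std::vector<size_t> &idx, int axis)
{
    if (axis < 0)
        axis += static_cast<int>(in.size()); // wrap negative axes, as the kernel does
    assert(axis >= 0 && axis < static_cast<int>(in.size()));
    std::vector<size_t> out(in.begin(), in.begin() + axis);
    out.insert(out.end(), idx.begin(), idx.end());
    out.insert(out.end(), in.begin() + axis + 1, in.end());
    return out;
}

int main()
{
    // Gather 5 entries along axis -2 (== axis 1) of a rank-3 input.
    const auto out = gather_shape({10, 4, 3}, {5}, -2);
    for (size_t d : out)
        std::printf("%zu ", d); // prints: 10 5 3
    std::printf("\n");
    return 0;
}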
Iterator src_it(num_dims, _src_it_strides, _input->buffer(), src_info->offset_first_element_in_bytes(), src_win); Iterator idx_it(num_dims, _idx_it_strides, _indices->buffer(), idx_info->offset_first_element_in_bytes(), idx_win); - Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(), dst_info->offset_first_element_in_bytes(), dst_win); - - execute_window_loop(dst_win, [&](const Coordinates &) { - const auto idx = *reinterpret_cast<const TIndex *>(idx_it.ptr()); - - if(idx >= 0 && idx < idx_limit) - { - const auto src_ptr = src_it.ptr() + idx * chunk_stride; + Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(), + dst_info->offset_first_element_in_bytes(), dst_win); - std::copy_n(src_ptr, window_size_x, dst_it.ptr()); - } - else + execute_window_loop( + dst_win, + [&](const Coordinates &) { - std::fill_n(dst_it.ptr(), window_size_x, 0); - } - }, src_it, idx_it, dst_it); + const auto idx = *reinterpret_cast<const TIndex *>(idx_it.ptr()); + + if (idx >= 0 && idx < idx_limit) + { + const auto src_ptr = src_it.ptr() + idx * chunk_stride; + + std::copy_n(src_ptr, window_size_x, dst_it.ptr()); + } + else + { + std::fill_n(dst_it.ptr(), window_size_x, 0); + } + }, + src_it, idx_it, dst_it); } void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) @@ -137,13 +145,13 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe _output = output; _axis = axis; - if(_axis < 0) + if (_axis < 0) { _axis += input->info()->num_dimensions(); } ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions())); - switch(_indices->info()->data_type()) + switch (_indices->info()->data_type()) { case DataType::U32: _func = &NEGatherKernel::gather_common<uint32_t>; @@ -157,7 +165,8 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe } // Output auto initialization if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); // Create window @@ -169,30 +178,31 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe // These will be used to iterate lock-step through all tensors (input, indices and output). 
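The copy loop above reads one index per output chunk, bounds-checks it against the axis extent, and zero-fills the destination for out-of-range indices rather than reading out of bounds. The same behaviour as a scalar axis-0 row gather (simplified sketch, assuming a dense row-major source):

#include <algorithm>
#include <cstdio>
#include <vector>

// Gather rows of a [rows x cols] row-major matrix; invalid indices yield zeros.
static std::vector<float> gather_rows(const std::vector<float> &src, size_t rows, size_t cols,
                                      const std::vector<int> &indices)
{
    std::vector<float> dst(indices.size() * cols);
    for (size_t i = 0; i < indices.size(); ++i)
    {
        const int idx = indices[i];
        float *out = dst.data() + i * cols;
        if (idx >= 0 && static_cast<size_t>(idx) < rows)
            std::copy_n(src.data() + static_cast<size_t>(idx) * cols, cols, out);
        else
            std::fill_n(out, cols, 0.0f); // out-of-range index: zero fill, no fault
    }
    return dst;
}

int main()
{
    const std::vector<float> src = {1, 2, /*row1*/ 3, 4, /*row2*/ 5, 6};
    const auto dst = gather_rows(src, 3, 2, {2, 0, 7}); // 7 is out of range
    for (float v : dst)
        std::printf("%g ", v); // 5 6 1 2 0 0
    std::printf("\n");
    return 0;
}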
size_t dim_no = 0; - const auto input_info = input->info(); + const auto input_info = input->info(); const auto &input_strides = input_info->strides_in_bytes(); - const auto indices_info = indices->info(); - const auto &indices_strides = indices_info->strides_in_bytes(); - const auto indices_num_dims = indices_info->num_dimensions(); + const auto indices_info = indices->info(); + const auto &indices_strides = indices_info->strides_in_bytes(); + const auto indices_num_dims = indices_info->num_dimensions(); - for(; dim_no < static_cast<size_t>(_axis); ++dim_no) + for (; dim_no < static_cast<size_t>(_axis); ++dim_no) { _src_it_strides[dim_no] = input_strides[dim_no]; } - for(; dim_no < static_cast<size_t>(_axis) + indices_num_dims; ++dim_no) + for (; dim_no < static_cast<size_t>(_axis) + indices_num_dims; ++dim_no) { _idx_it_strides[dim_no] = indices_strides[dim_no - _axis]; } - for(; dim_no < Coordinates::num_max_dimensions; ++dim_no) + for (; dim_no < Coordinates::num_max_dimensions; ++dim_no) { _src_it_strides[dim_no] = input_strides[dim_no - indices_num_dims + 1]; } } -Status NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) +Status +NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis)); return Status{}; diff --git a/src/core/NEON/kernels/NEGatherKernel.h b/src/core/NEON/kernels/NEGatherKernel.h index ce69daeda7..b8c069f99e 100644 --- a/src/core/NEON/kernels/NEGatherKernel.h +++ b/src/core/NEON/kernels/NEGatherKernel.h @@ -26,6 +26,7 @@ #define ARM_COMPUTE_NEGATHERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -92,8 +93,8 @@ private: ITensor *_output; kernel_ptr _func; - Strides _src_it_strides; - Strides _idx_it_strides; + Strides _src_it_strides; + Strides _idx_it_strides; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEGATHERKERNEL_H */ diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp index 7bba136e84..549319e49f 100644 --- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp +++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp @@ -27,11 +27,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/genproposals/list.h" + #include <arm_neon.h> namespace arm_compute @@ -44,7 +46,8 @@ struct ComputeAllAnchorsData }; using ComputeAllAnchorsSelectorPtr = std::add_pointer<bool(const ComputeAllAnchorsData &data)>::type; -using ComputeAllAnchorsUKernelPtr = std::add_pointer<void(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)>::type; +using ComputeAllAnchorsUKernelPtr = std::add_pointer<void( + const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)>::type; struct ComputeAllAnchorsKernel { @@ -53,27 +56,17 @@ struct ComputeAllAnchorsKernel ComputeAllAnchorsUKernelPtr ukernel; }; -static const ComputeAllAnchorsKernel available_kernels[] = -{ +static const ComputeAllAnchorsKernel available_kernels[] = { #if defined(ARM_COMPUTE_ENABLE_NEON) - { - 
"neon_qu16_computeallanchors", - [](const ComputeAllAnchorsData & data) { return data.dt == DataType::QSYMM16; }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors) - }, + {"neon_qu16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::QSYMM16; }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)}, #endif //defined(ARM_COMPUTE_ENABLE_NEON) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "neon_fp16_computeallanchors", - [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors) - }, + {"neon_fp16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "neon_fp32_computeallanchors", - [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors) - }, + {"neon_fp32_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)}, }; /** Micro-kernel selector @@ -84,9 +77,9 @@ static const ComputeAllAnchorsKernel available_kernels[] = */ const ComputeAllAnchorsKernel *get_implementation(const ComputeAllAnchorsData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -101,7 +94,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2); - if(all_anchors->total_size() > 0) + if (all_anchors->total_size() > 0) { const size_t feature_height = info.feat_height(); const size_t feature_width = info.feat_width(); @@ -111,7 +104,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors); - if(is_data_type_quantized(anchors->data_type())) + if (is_data_type_quantized(anchors->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors); } @@ -139,7 +132,8 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a // Initialize the output if empty const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors); - auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info())); + auto_init_if_empty(*all_anchors->info(), + TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info())); // Set instance variables _anchors = anchors; @@ -151,7 +145,9 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a INEKernel::configure(win); } -Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info) +Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, + const ITensorInfo *all_anchors, + const ComputeAnchorsInfo &info) { 
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info)); return Status{}; @@ -163,7 +159,7 @@ void NEComputeAllAnchorsKernel::run(const Window &window, const ThreadInfo &info ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = get_implementation(ComputeAllAnchorsData{ _anchors->info()->data_type() }); + const auto *uk = get_implementation(ComputeAllAnchorsData{_anchors->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_anchors, _all_anchors, _anchors_info, window); diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h index 297d6d4abe..30699eee01 100644 --- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h +++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h @@ -78,5 +78,5 @@ private: ITensor *_all_anchors; ComputeAnchorsInfo _anchors_info; }; -} // arm_compute +} // namespace arm_compute #endif // ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp index 71641404bf..0a1780f6ee 100644 --- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp @@ -31,12 +31,13 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/instancenorm/list.h" #include <arm_neon.h> @@ -51,7 +52,13 @@ struct InstanceNormSelectorData }; using InstanceNormSelctorPtr = std::add_pointer<bool(const InstanceNormSelectorData &data)>::type; -using InstanceNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)>::type; +using InstanceNormUKernelPtr = std::add_pointer<void(ITensor *input, + ITensor *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision, + const Window &window)>::type; struct InstanceNormKernel { @@ -60,19 +67,12 @@ struct InstanceNormKernel InstanceNormUKernelPtr ukernel; }; -static const InstanceNormKernel available_kernels[] = -{ - { - "fp32_neon_instancenorm", - [](const InstanceNormSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm) - }, +static const InstanceNormKernel available_kernels[] = { + {"fp32_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_instancenorm", - [](const InstanceNormSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm) - }, + {"fp16_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC }; @@ -84,9 +84,9 @@ static 
const InstanceNormKernel available_kernels[] = */ const InstanceNormKernel *get_implementation(const InstanceNormSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -102,14 +102,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, "NHWC data layout is not supported by the kernel directly"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, + "NHWC data layout is not supported by the kernel directly"); - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); } return Status{}; } @@ -132,7 +134,9 @@ NEInstanceNormalizationLayerKernel::NEInstanceNormalizationLayerKernel() { } -void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const InstanceNormalizationLayerKernelInfo &info) +void NEInstanceNormalizationLayerKernel::configure(ITensor *input, + ITensor *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -152,10 +156,13 @@ void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *outp INEKernel::configure(std::get<1>(win_config)); } -Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info) +Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info.gamma, info.beta, info.epsilon)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (output == nullptr ? 
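The static validate() overloads reshaped in these hunks mirror configure() so a caller can test a configuration without mutating tensor state; ARM_COMPUTE_RETURN_ON_ERROR forwards the first failing Status. A usage sketch, assuming initialized tensors and Status's usual explicit bool conversion (true on success):

    const Status s = NEInstanceNormalizationLayerKernel::validate(src.info(), dst.info(), info);
    if (bool(s)) // configuration is supported
    {
        kernel.configure(&src, &dst, info);
    }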
input->clone().get() : output->clone().get())))); return Status{}; } @@ -165,7 +172,7 @@ void NEInstanceNormalizationLayerKernel::run(const Window &window, const ThreadI ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = get_implementation(InstanceNormSelectorData{ _input->info()->data_type() }); + const auto *uk = get_implementation(InstanceNormSelectorData{_input->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_input, _output, _gamma, _beta, _epsilon, _use_mixed_precision, window); diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h index f166ce2058..024ccd9ef2 100644 --- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h @@ -68,7 +68,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -82,14 +83,15 @@ private: * @param[in] beta The offset scalar value applied to the normalized tensor. Defaults to 0.0 * @param[in] epsilon Lower bound value for the normalization. Defaults to 1e-12 */ - using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); + using NormalizationFunction = + void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); ITensor *_input; ITensor *_output; float _gamma; float _beta; float _epsilon; - bool _use_mixed_precision{ true }; + bool _use_mixed_precision{true}; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */ diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp index 8ab0288ab1..eea57a17d3 100644 --- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp +++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp @@ -30,11 +30,12 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/common/cpuinfo/CpuIsaInfo.h" -#include "src/core/NEON/NEMath.h" #include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEMath.h" #include "src/cpu/kernels/l2normlayer/list.h" #include <arm_neon.h> @@ -55,7 +56,8 @@ struct L2NormalizeLayerSelectorData using L2NormalizeLayerKernelSelctorPtr = std::add_pointer<bool(const L2NormalizeLayerSelectorData &data)>::type; -using L2NormalizeLayerPtr = std::add_pointer<void(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)>::type; +using L2NormalizeLayerPtr = std::add_pointer<void( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)>::type; struct L2NormalizeLayerKernel { @@ -64,26 +66,25 @@ struct L2NormalizeLayerKernel L2NormalizeLayerPtr ukernel; }; -static const L2NormalizeLayerKernel available_kernels[] = -{ - { - "fp32_neon_l2normalize_x", - [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && 
data.actual_axis == Window::DimX; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x) - }, - { - "fp32_neon_l2normalize_yz", - [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz) - }, +static const L2NormalizeLayerKernel available_kernels[] = { + {"fp32_neon_l2normalize_x", + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x)}, + {"fp32_neon_l2normalize_yz", + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz)}, { "fp16_neon_l2normalize_x", - [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; }, + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; }, REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_x), }, { "fp16_neon_l2normalize_yz", - [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; }, + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; }, REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_yz), }, }; @@ -96,9 +97,9 @@ static const L2NormalizeLayerKernel available_kernels[] = */ const L2NormalizeLayerKernel *get_implementation(const L2NormalizeLayerSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -106,7 +107,8 @@ const L2NormalizeLayerKernel *get_implementation(const L2NormalizeLayerSelectorD return nullptr; } -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_UNUSED(epsilon); @@ -115,14 +117,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, + "Actual normalization axis greater than max number of dimensions"); // Reduce shape on axis TensorShape sum_shape = input->tensor_shape(); sum_shape.set(actual_axis, 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -151,7 +154,8 @@ NEL2NormalizeLayerKernel::NEL2NormalizeLayerKernel() { } -void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, 
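Unlike the earlier tables, which key on data type alone, the L2-normalize selector also keys on the normalization axis, presumably because DimX is the innermost, contiguous dimension and can use a dedicated kernel. Spelled out, the selection reduces to:

    // L2NormalizeLayerSelectorData{dt, actual_axis, isa} resolves as:
    //   F32, axis == DimX             -> neon_fp32_l2_normalize_x
    //   F32, axis != DimX             -> neon_fp32_l2_normalize_yz
    //   F16 + isa.fp16, axis == DimX  -> neon_fp16_l2_normalize_x
    //   F16 + isa.fp16, axis != DimX  -> neon_fp16_l2_normalize_yz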
int axis, float epsilon) +void NEL2NormalizeLayerKernel::configure( + const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon)); @@ -169,10 +173,12 @@ void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *su INEKernel::configure(std::get<1>(win_config)); } -Status NEL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status NEL2NormalizeLayerKernel::validate( + const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + ARM_COMPUTE_RETURN_ON_ERROR( + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); return Status{}; } @@ -183,12 +189,13 @@ void NEL2NormalizeLayerKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - if(_actual_axis > 2) + if (_actual_axis > 2) { ARM_COMPUTE_ERROR("Unsupported normalization axis"); } - const auto *uk = get_implementation(L2NormalizeLayerSelectorData{ _output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa() }); + const auto *uk = get_implementation( + L2NormalizeLayerSelectorData{_output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr); ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h index af3ad3403e..3524e66a21 100644 --- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h +++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h @@ -74,7 +74,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); + static Status + validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NELogicalKernel.cpp b/src/core/NEON/kernels/NELogicalKernel.cpp index 6939e08ef0..6be6284528 100644 --- a/src/core/NEON/kernels/NELogicalKernel.cpp +++ b/src/core/NEON/kernels/NELogicalKernel.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,7 +51,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1); ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16))); src0 += step; @@ -58,7 +59,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), 
c1_x8))); src0 += half_step; @@ -66,7 +67,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src0) && (*src1); ++src0; @@ -84,21 +85,21 @@ void neon_logical_and_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8 const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s); const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16)); src += step; dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8)); src += half_step; dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src) && broadcast_val_clamped_s; ++src; @@ -112,7 +113,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1); ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16))); src0 += step; @@ -120,7 +121,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8))); src0 += half_step; @@ -128,7 +129,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src0) || (*src1); ++src0; @@ -146,21 +147,21 @@ void neon_logical_or_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_ const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s); const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16)); src += step; dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8)); src += half_step; dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src) || broadcast_val_clamped_s; ++src; @@ -173,21 +174,21 @@ void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len) ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vbslq_u8(vceqq_u8(vld1q_u8(src), c0_x16), c1_x16, c0_x16)); src += step; dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vbsl_u8(vceq_u8(vld1_u8(src), c0_x8), c1_x8, c0_x8)); src += half_step; dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = !(*src); ++src; @@ -197,18 +198,15 @@ void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len) void run_unary(const Window &window, const ITensor *src, ITensor *dst) { - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); const auto len = window.x().end() - window.x().start(); Iterator in(src, win); Iterator 
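Every logical primitive above follows the same three-tier tail handling: 16-byte q-register iterations, then one optional 8-byte d-register iteration, then a scalar loop; operands are first clamped to {0,1} with vmin against a vector of ones so the bitwise vand/vorr produce boolean results. A minimal sketch of the tiering for a unary pass-through, assuming a NEON-enabled target:

    #include <arm_neon.h>
    #include <cstdint>

    void copy_three_tier(const uint8_t *src, uint8_t *dst, uint32_t len)
    {
        for (; len >= 16; len -= 16) // full 128-bit blocks
        {
            vst1q_u8(dst, vld1q_u8(src));
            src += 16;
            dst += 16;
        }
        for (; len >= 8; len -= 8) // one 64-bit block
        {
            vst1_u8(dst, vld1_u8(src));
            src += 8;
            dst += 8;
        }
        for (; len > 0; --len) // scalar tail
        {
            *dst++ = *src++;
        }
    }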
out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - neon_logical_not(in.ptr(), out.ptr(), len); - }, - in, out); + execute_window_loop( + win, [&](const Coordinates &) { neon_logical_not(in.ptr(), out.ptr(), len); }, in, out); } void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, ITensor *dst, LogicalOperation op) @@ -216,16 +214,17 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, Window src0_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); Window src1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); const auto len = window.x().end() - window.x().start(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { - using LogicalBroadcastUKernelPtr = std::add_pointer<void(const uint8_t *, uint8_t, uint8_t *, uint32_t)>::type; - LogicalBroadcastUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast; + using LogicalBroadcastUKernelPtr = std::add_pointer<void(const uint8_t *, uint8_t, uint8_t *, uint32_t)>::type; + LogicalBroadcastUKernelPtr logical_func = + op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast; const bool is_broadcast_input_1 = src1_win.x().step() == 0; Window broadcast_win = is_broadcast_input_1 ? src1_win : src0_win; @@ -238,17 +237,18 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, Iterator non_broadcast_in(non_broadcast_tensor, non_broadcast_win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const uint8_t broadcast_value = *broadcast_in.ptr(); - logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len); - - }, - broadcast_in, non_broadcast_in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const uint8_t broadcast_value = *broadcast_in.ptr(); + logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len); + }, + broadcast_in, non_broadcast_in, out); } else { - using LogicalUKernelPtr = std::add_pointer<void(const uint8_t *, const uint8_t *, uint8_t *, uint32_t)>::type; + using LogicalUKernelPtr = std::add_pointer<void(const uint8_t *, const uint8_t *, uint8_t *, uint32_t)>::type; LogicalUKernelPtr logical_func = op == LogicalOperation::Or ? 
&neon_logical_or : &neon_logical_and; src0_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -257,11 +257,8 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, Iterator in0(src0, src0_win); Iterator in1(src1, src1_win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - logical_func(in0.ptr(), in1.ptr(), out.ptr(), len); - }, - in0, in1, out); + execute_window_loop( + win, [&](const Coordinates &) { logical_func(in0.ptr(), in1.ptr(), out.ptr(), len); }, in0, in1, out); } } } // namespace @@ -270,7 +267,10 @@ const char *NELogicalKernel::name() const return "NELogicalKernel"; } -void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, LogicalOperation op) +void NELogicalKernel::configure(const ITensorInfo *input1, + const ITensorInfo *input2, + ITensorInfo *output, + LogicalOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, output); ARM_COMPUTE_ERROR_THROW_ON(validate(input1, input2, output, op)); @@ -279,7 +279,7 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in Window win = calculate_max_window(*input1, Steps()); TensorShape out_shape = input1->tensor_shape(); - if(op != LogicalOperation::Not) + if (op != LogicalOperation::Not) { ARM_COMPUTE_ERROR_ON_NULLPTR(input2); out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); @@ -292,13 +292,16 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in set_data_type_if_unknown(*output, input1->data_type()); } -Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op) +Status NELogicalKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + LogicalOperation op) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); ARM_COMPUTE_RETURN_ERROR_ON(op == LogicalOperation::Unknown); TensorShape out_shape = input1->tensor_shape(); - if(op != LogicalOperation::Not) + if (op != LogicalOperation::Not) { out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); @@ -306,7 +309,7 @@ Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *i } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); @@ -326,7 +329,7 @@ void NELogicalKernel::run_op(ITensorPack &tensors, const Window &window, const T const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - if(_op == LogicalOperation::Not) + if (_op == LogicalOperation::Not) { run_unary(window, src0, dst); } diff --git a/src/core/NEON/kernels/NELogicalKernel.h b/src/core/NEON/kernels/NELogicalKernel.h index caf69cf45d..477a59d826 100644 --- a/src/core/NEON/kernels/NELogicalKernel.h +++ b/src/core/NEON/kernels/NELogicalKernel.h @@ -58,10 +58,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op); + static Status + validate(const ITensorInfo *input1, const 
ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp index 37e88a8565..451031d696 100644 --- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp @@ -28,12 +28,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/meanstddevnorm/list.h" namespace arm_compute @@ -46,7 +47,8 @@ struct MeanStdDevNormSelectorData }; using MeanStdDevNormSelctorPtr = std::add_pointer<bool(const MeanStdDevNormSelectorData &data)>::type; -using MeanStdDevNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float epsilon, const Window &window)>::type; +using MeanStdDevNormUKernelPtr = + std::add_pointer<void(ITensor *input, ITensor *output, float epsilon, const Window &window)>::type; struct MeanStdDevNormKernel { @@ -55,25 +57,15 @@ struct MeanStdDevNormKernel MeanStdDevNormUKernelPtr ukernel; }; -static const std::vector<MeanStdDevNormKernel> available_kernels = -{ - { - "fp32_neon_meanstddevnorm", - [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm) - }, +static const std::vector<MeanStdDevNormKernel> available_kernels = { + {"fp32_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_meanstddevnorm", - [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm) - }, + {"fp16_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "qasymm8_neon_meanstddevnorm", - [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm) - }, + {"qasymm8_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)}, }; /** Micro-kernel selector @@ -84,9 +76,9 @@ static const std::vector<MeanStdDevNormKernel> available_kernels = */ const MeanStdDevNormKernel *get_implementation(const MeanStdDevNormSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -103,7 +95,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo 
*output, f ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -113,7 +105,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) { - if(output != nullptr) + if (output != nullptr) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Output auto initialization if not yet initialized @@ -128,8 +120,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel() - : _input(nullptr), _output(nullptr), _epsilon(1e-8f) +NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel() : _input(nullptr), _output(nullptr), _epsilon(1e-8f) { } @@ -137,7 +128,8 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output, { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon)); + ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate( + input->info(), (output != nullptr) ? output->info() : nullptr, epsilon)); _input = input; _output = (output == nullptr) ? input : output; @@ -152,7 +144,9 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output, Status NEMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, epsilon)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ?
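A null output selects the in-place path: configure() substitutes the input as its own destination, and the validate helpers clone conditionally so the null is never dereferenced. Sketch, assuming a ready F32 tensor and the kernel's default epsilon value:

    NEMeanStdDevNormalizationKernel kernel;
    kernel.configure(&tensor, nullptr, 1e-8f); // nullptr output => normalize `tensor` in place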
output->clone().get() : nullptr) + .first); return Status{}; } @@ -162,7 +156,7 @@ void NEMeanStdDevNormalizationKernel::run(const Window &window, const ThreadInfo ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - const auto *uk = get_implementation(MeanStdDevNormSelectorData{ _output->info()->data_type() }); + const auto *uk = get_implementation(MeanStdDevNormSelectorData{_output->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_input, _output, _epsilon, window); diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp index 49a045382d..2c61bda147 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp @@ -29,19 +29,23 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/NormalizationHelpers.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *input_squared, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); @@ -52,7 +56,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -69,7 +73,10 @@ NENormalizationLayerKernel::NENormalizationLayerKernel() { } -void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info) +void NENormalizationLayerKernel::configure(const ITensor *input, + const ITensor *input_squared, + ITensor *output, + NormalizationLayerInfo norm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output); // Output tensor auto initialization if not yet initialized @@ -85,15 +92,15 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * _output = output; _norm_info = norm_info; - switch(_input->info()->data_type()) + switch (_input->info()->data_type()) { case DataType::F32: { - switch(norm_idx) + switch (norm_idx) { case 0: { - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { _func = &NENormalizationLayerKernel::normalize_float<float, 4, 0, true>; } @@ -104,7 +111,7 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * break; } case 1: - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { _func = 
&NENormalizationLayerKernel::normalize_float<float, 4, 1, true>; } @@ -124,11 +131,11 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: { - switch(norm_idx) + switch (norm_idx) { case 0: { - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 0, true>; } @@ -139,7 +146,7 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * break; } case 1: - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 1, true>; } @@ -196,8 +203,9 @@ void NENormalizationLayerKernel::normalize_float(const Window &window) const auto beta_vec = wrapper::vdup_n(static_cast<T>(_norm_info.beta()), ExactTagType{}); const auto kappa_vec = wrapper::vdup_n(static_cast<T>(_norm_info.kappa()), ExactTagType{}); - auto sequential_normalization = [&](const int x, const Coordinates & id, const int current_row, const int first_row, const int last_row, const T * input_ptr, const uint8_t *input_squared_start_ptr, - T * output_ptr) + auto sequential_normalization = [&](const int x, const Coordinates &id, const int current_row, const int first_row, + const int last_row, const T *input_ptr, const uint8_t *input_squared_start_ptr, + T *output_ptr) { const int current_slice = dim == 0 ? x : id[dim]; const int first_slice = std::max(current_slice - radius, 0); @@ -206,75 +214,87 @@ void NENormalizationLayerKernel::normalize_float(const Window &window) const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x; // Accumulate 2D In-Map values auto accu = static_cast<T>(0.f); - for(int j = first_row; j <= last_row; ++j) + for (int j = first_row; j <= last_row; ++j) { // Compute row displacement const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row; - for(int i = first_slice; i <= last_slice; ++i) + for (int i = first_slice; i <= last_slice; ++i) { - accu += *reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice); + accu += + *reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice); } } // Normalize - const auto normalized = std::pow(accu * static_cast<T>(_norm_info.scale_coeff()) + static_cast<T>(_norm_info.kappa()), _norm_info.beta()); + const auto normalized = std::pow( + accu * static_cast<T>(_norm_info.scale_coeff()) + static_cast<T>(_norm_info.kappa()), _norm_info.beta()); const auto normalized_pixel = (*(input_ptr + x)) / normalized; *(output_ptr + x) = normalized_pixel; }; - execute_window_loop(win, [&](const Coordinates & id) - { - const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - auto output_ptr = reinterpret_cast<T *>(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &id) + { + const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<T *>(output.ptr()); - // Get range to normalize - const int current_row = do_2D_norm ? id[dim_y] : 0; - const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0; - const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; + // Get range to normalize + const int current_row = do_2D_norm ? id[dim_y] : 0; + const int first_row = do_2D_norm ? 
std::max(current_row - radius, 0) : 0; + const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; - int x = window_start_x; - // Compute serially starting elements for the case x dimension is width - for(; x < radius && x < window_end_x && dim == 0; ++x) - { - sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr); - } + int x = window_start_x; + // Compute serially starting elements for the case x dimension is width + for (; x < radius && x < window_end_x && dim == 0; ++x) + { + sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), + output_ptr); + } - // Compute vectorized - for(; x <= window_end_x - window_step_x - radius; x += window_step_x) - { - const int current_slice = dim == 0 ? x : id[dim]; - const int first_slice = std::max(current_slice - radius, 0); - const int last_slice = std::min(current_slice + radius, max_right); - - const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x; - // Accumulate 2D In-Map values - auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - for(int j = first_row; j <= last_row; ++j) + // Compute vectorized + for (; x <= window_end_x - window_step_x - radius; x += window_step_x) { - // Compute row displacement - const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row; - for(int i = first_slice; i <= last_slice; ++i) + const int current_slice = dim == 0 ? x : id[dim]; + const int first_slice = std::max(current_slice - radius, 0); + const int last_slice = std::min(current_slice + radius, max_right); + + const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x; + // Accumulate 2D In-Map values + auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + for (int j = first_row; j <= last_row; ++j) { - accu = wrapper::vadd(accu, wrapper::vloadq(reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice))); + // Compute row displacement + const uint8_t *const input_squared_ptr = + input_squared_x_ptr + (j - current_row) * input_squared_stride_row; + for (int i = first_slice; i <= last_slice; ++i) + { + accu = wrapper::vadd( + accu, wrapper::vloadq(reinterpret_cast<const T *>( + input_squared_ptr + (i - current_slice) * input_squared_stride_slice))); + } } - } - // Normalize - const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec); - const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized)); - wrapper::vstore(reinterpret_cast<T *>(output_ptr + x), normalized_pixel); - } + // Normalize + const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec); + const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized)); + wrapper::vstore(reinterpret_cast<T *>(output_ptr + x), normalized_pixel); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr); - } - }, - input, input_squared, output); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), + output_ptr); + } + }, + input, input_squared, output); } -Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const 
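Both the scalar fallback and the reindented vector body evaluate the same normalization; writing S(x) for the windowed sum of squares accumulated above, each element becomes

    // out(x) = in(x) / (kappa + scale_coeff * S(x)) ^ beta
    // vector path: accu -> vmla(kappa_vec, coeff_vec, accu) -> vpow(., beta_vec) -> vmul(in, vinv(.))

with scale_coeff, kappa and beta taken from the NormalizationLayerInfo. The serial head and leftover loops exist because the vectorized loop reads up to radius elements beyond its current block, so the window edges are finished element-wise.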
ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info) +Status NENormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *input_squared, + const ITensorInfo *output, + const NormalizationLayerInfo norm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info)); diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.h b/src/core/NEON/kernels/NENormalizationLayerKernel.h index 53a06b9ed9..2d8d9f3d60 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.h @@ -60,7 +60,8 @@ public: * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input. * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. */ - void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info); + void + configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info); /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], @@ -72,7 +73,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, NormalizationLayerInfo norm_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *input_squared, + const ITensorInfo *output, + NormalizationLayerInfo norm_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp index 734510b637..c9bcbc9127 100644 --- a/src/core/NEON/kernels/NEPadLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp @@ -28,26 +28,31 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &paddings, const PaddingMode mode) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &paddings, + const PaddingMode mode) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MSG(mode != PaddingMode::CONSTANT, "Only constant padding mode is supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(paddings.size() > 4, "Padding list bigger than 4 dimensions"); - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings); - const TensorInfo expected_output_info = input->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = + 
arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings); + const TensorInfo expected_output_info = input->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -58,30 +63,34 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c template <typename T> void NEPadLayerKernel::run_pad_constant(const Window &window) { - Window output_window{ window }; + Window output_window{window}; output_window.set(Window::DimX, Window::Dimension(0, 1, 1)); const size_t element_size = _input->info()->element_size(); Iterator output_it(_output, output_window); - execute_window_loop(output_window, [&](const Coordinates & id) - { - Coordinates idin{ id }; - for(size_t dim = _padding.size() - 1; dim > 0; --dim) + execute_window_loop( + output_window, + [&](const Coordinates &id) { - idin[dim] -= _padding[dim].first; - if(idin[dim] < 0 || static_cast<int>(_input->info()->dimension(dim)) - 1 < idin[dim]) + Coordinates idin{id}; + for (size_t dim = _padding.size() - 1; dim > 0; --dim) { - std::fill_n(reinterpret_cast<T *>(output_it.ptr()), _output->info()->dimension(0), _constant_value.get<T>()); - return; + idin[dim] -= _padding[dim].first; + if (idin[dim] < 0 || static_cast<int>(_input->info()->dimension(dim)) - 1 < idin[dim]) + { + std::fill_n(reinterpret_cast<T *>(output_it.ptr()), _output->info()->dimension(0), + _constant_value.get<T>()); + return; + } } - } - T *input_it_ptr = reinterpret_cast<T *>(_input->ptr_to_element(idin)); - T *output_it_ptr = reinterpret_cast<T *>(output_it.ptr()); - std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get<T>()); - memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size); - std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second, _constant_value.get<T>()); - }, - output_it); + T *input_it_ptr = reinterpret_cast<T *>(_input->ptr_to_element(idin)); + T *output_it_ptr = reinterpret_cast<T *>(output_it.ptr()); + std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get<T>()); + memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size); + std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second, + _constant_value.get<T>()); + }, + output_it); } void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window) @@ -92,7 +101,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window const size_t end_plane = window.z().end(); size_t start_plane_input = start_plane; - if(_padding.size() > 2) + if (_padding.size() > 2) { start_plane_input = (start_plane < _padding[2].first) ? 
0 : start_plane - _padding[2].first; } @@ -105,18 +114,20 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window const size_t jump_to_next_row_input = _input->info()->dimension(0); const size_t jump_to_next_row_output = _padding[0].first + _padding[0].second; - uint8_t *output_row_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size; - const uint8_t *input_it_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size; - const auto pad_value = _constant_value.get<uint8_t>(); + uint8_t *output_row_ptr = + _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size; + const uint8_t *input_it_ptr = + _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size; + const auto pad_value = _constant_value.get<uint8_t>(); - for(size_t z_i = start_plane; z_i < end_plane; ++z_i) + for (size_t z_i = start_plane; z_i < end_plane; ++z_i) { - if(_padding.size() > 2 && z_i < _padding[2].first) + if (_padding.size() > 2 && z_i < _padding[2].first) { memset(output_row_ptr, pad_value, output_plane_size); output_row_ptr += output_plane_size; } - else if(_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1)) + else if (_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1)) { memset(output_row_ptr, pad_value, output_plane_size); output_row_ptr += output_plane_size; @@ -127,7 +138,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window output_row_ptr += pad_y_elems_top; size_t y_i = _input->info()->dimension(1); // Basic loop unrolling - for(; y_i > 3; y_i -= 4) + for (; y_i > 3; y_i -= 4) { memset(output_row_ptr, pad_value, _padding[0].first); output_row_ptr += _padding[0].first; @@ -160,7 +171,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window memset(output_row_ptr, pad_value, _padding[0].second); output_row_ptr += _padding[0].second; } - for(; y_i > 0; --y_i) + for (; y_i > 0; --y_i) { memset(output_row_ptr, pad_value, _padding[0].first); output_row_ptr += _padding[0].first; @@ -183,12 +194,17 @@ NEPadLayerKernel::NEPadLayerKernel() { } -void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +void NEPadLayerKernel::configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Auto-init - const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding); - const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = + arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding); + const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape); auto_init_if_empty(*output->info(), expected_output_info); // Perform validation step @@ -200,14 +216,14 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL _constant_value = constant_value; _mode = mode; - if(_mode == PaddingMode::CONSTANT) + if (_mode == PaddingMode::CONSTANT) { - switch(_input->info()->element_size()) + switch 
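The two constant-pad paths reorganized above divide the work differently: the specialized uint8 3-D routine emits each output row as three flat segments with a manually 4x-unrolled row loop, while the generic template maps every output coordinate back by the leading pads and constant-fills whole rows whenever the shifted coordinate falls outside the input. Schematically:

    // per output row:  [ _padding[0].first x pad | memcpy(input row) | _padding[0].second x pad ]
    // per plane z_i:   fully memset to the pad value when z_i < _padding[2].first
    //                  or z_i >= _padding[2].first + input depth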
(_input->info()->element_size()) { case 1: - if(_input->info()->num_dimensions() == 3 && // Is 3D - padding.size() <= 3 && // Has 3D padding - !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding + if (_input->info()->num_dimensions() == 3 && // Is 3D + padding.size() <= 3 && // Has 3D padding + !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding { _func = &NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad; } @@ -240,7 +256,11 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL ICPPKernel::configure(win); } -Status NEPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +Status NEPadLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_UNUSED(constant_value); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, mode)); @@ -253,7 +273,7 @@ void NEPadLayerKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - if(_func != nullptr) + if (_func != nullptr) { (this->*_func)(window); } @@ -263,7 +283,7 @@ size_t NEPadLayerKernel::get_mws(const CPUInfo &platform, size_t thread_count) c { ARM_COMPUTE_UNUSED(thread_count); ARM_COMPUTE_UNUSED(platform); - + return ICPPKernel::default_mws; } diff --git a/src/core/NEON/kernels/NEPadLayerKernel.h b/src/core/NEON/kernels/NEPadLayerKernel.h index f82af1558a..d432887d2c 100644 --- a/src/core/NEON/kernels/NEPadLayerKernel.h +++ b/src/core/NEON/kernels/NEPadLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEPADLAYERKERNEL_H #include "arm_compute/core/PixelValue.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -62,7 +63,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT. * Only CONSTANT padding mode is currently supported */ - void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT); + void configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value = PixelValue(), + const PaddingMode mode = PaddingMode::CONSTANT); /** Static function to check if given info will lead to a valid configuration of @ref NEPadLayer. * * @param[in] input Source tensor info. Data types supported: All. 
@@ -75,7 +80,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value = PixelValue(), + const PaddingMode mode = PaddingMode::CONSTANT); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp index 3d89933377..15e933e66e 100644 --- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -36,7 +37,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status validate_arguments(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32); @@ -45,10 +49,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, // Check variances const int var_size = info.variances().size(); - if(var_size > 1) + if (var_size > 1) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values"); - for(int i = 0; i < var_size; ++i) + for (int i = 0; i < var_size; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0"); } @@ -56,17 +60,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0"); - if(!info.max_sizes().empty()) + if (!info.max_sizes().empty()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), + "Max and min sizes dimensions should match"); } - for(unsigned int i = 0; i < info.max_sizes().size(); ++i) + for (unsigned int i = 0; i < info.max_sizes().size(); ++i) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], + "Max size should be greater than min size"); } - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); @@ -76,21 +82,26 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, } } // namespace -NEPriorBoxLayerKernel::NEPriorBoxLayerKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr), _info() +NEPriorBoxLayerKernel::NEPriorBoxLayerKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr), _info() { } -void 
NEPriorBoxLayerKernel::store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, - const int height) +void NEPriorBoxLayerKernel::store_coordinates(float *out, + const int offset, + const float center_x, + const float center_y, + const float box_width, + const float box_height, + const int width, + const int height) { float xmin = (center_x - box_width / 2.f) / width; float ymin = (center_y - box_height / 2.f) / height; float xmax = (center_x + box_width / 2.f) / width; float ymax = (center_y + box_height / 2.f) / height; - float32x4_t vec_elements = { xmin, ymin, xmax, ymax }; - if(_info.clip()) + float32x4_t vec_elements = {xmin, ymin, xmax, ymax}; + if (_info.clip()) { static const float32x4_t CONST_0 = vdupq_n_f32(0.f); static const float32x4_t CONST_1 = vdupq_n_f32(1.f); @@ -112,7 +123,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window) int img_width = _info.img_size().x; int img_height = _info.img_size().y; - if(img_width == 0 || img_height == 0) + if (img_width == 0 || img_height == 0) { img_width = _input2->info()->dimension(width_idx); img_height = _input2->info()->dimension(height_idx); @@ -120,7 +131,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window) float step_x = _info.steps()[0]; float step_y = _info.steps()[1]; - if(step_x == 0.f || step_y == 0.f) + if (step_x == 0.f || step_y == 0.f) { step_x = static_cast<float>(img_width) / layer_width; step_y = static_cast<float>(img_height) / layer_height; @@ -130,74 +141,80 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window) slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2)); Iterator output(_output, slice); - execute_window_loop(slice, [&](const Coordinates & id) - { - float center_x = 0; - float center_y = 0; - int idx = id.x() / (4 * num_priors); - center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x; - center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y; - - float box_width; - float box_height; - int offset = 0; - - auto out = reinterpret_cast<float *>(output.ptr()); - for(unsigned int i = 0; i < _info.min_sizes().size(); ++i) + execute_window_loop( + slice, + [&](const Coordinates &id) { - const float min_size = _info.min_sizes().at(i); - box_width = min_size; - box_height = min_size; - store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); - offset += 4; - - if(!_info.max_sizes().empty()) + float center_x = 0; + float center_y = 0; + int idx = id.x() / (4 * num_priors); + center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x; + center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y; + + float box_width; + float box_height; + int offset = 0; + + auto out = reinterpret_cast<float *>(output.ptr()); + for (unsigned int i = 0; i < _info.min_sizes().size(); ++i) { - const float max_size = _info.max_sizes().at(i); - box_width = std::sqrt(min_size * max_size); - box_height = box_width; - + const float min_size = _info.min_sizes().at(i); + box_width = min_size; + box_height = min_size; store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); offset += 4; - } - // rest of priors - for(auto ar : _info.aspect_ratios()) - { - if(fabs(ar - 1.) 
< 1e-6) + if (!_info.max_sizes().empty()) { - continue; + const float max_size = _info.max_sizes().at(i); + box_width = std::sqrt(min_size * max_size); + box_height = box_width; + + store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); + offset += 4; } - box_width = min_size * sqrt(ar); - box_height = min_size / sqrt(ar); + // rest of priors + for (auto ar : _info.aspect_ratios()) + { + if (fabs(ar - 1.) < 1e-6) + { + continue; + } - store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); - offset += 4; + box_width = min_size * sqrt(ar); + box_height = min_size / sqrt(ar); + + store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); + offset += 4; + } } - } - // set the variance - out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1))); - float32x4_t var; - if(_info.variances().size() == 1) - { - var = vdupq_n_f32(_info.variances().at(0)); - } - else - { - const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) }; - var = vars; - } - for(int i = 0; i < num_priors; ++i) - { - vst1q_f32(out + 4 * i, var); - } - }, - output); + // set the variance + out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1))); + float32x4_t var; + if (_info.variances().size() == 1) + { + var = vdupq_n_f32(_info.variances().at(0)); + } + else + { + const float32x4_t vars = {_info.variances().at(0), _info.variances().at(1), _info.variances().at(2), + _info.variances().at(3)}; + var = vars; + } + for (int i = 0; i < num_priors; ++i) + { + vst1q_f32(out + 4 * i, var); + } + }, + output); } -void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info) +void NEPriorBoxLayerKernel::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); @@ -215,7 +232,10 @@ void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *inpu INEKernel::configure(win); } -Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info)); @@ -231,4 +251,4 @@ void NEPriorBoxLayerKernel::run(const Window &window, const ThreadInfo &info) // Run function calculate_prior_boxes(window); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h index 430a47f9f8..460f80e085 100644 --- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h +++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h @@ -67,7 +67,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -84,7 +87,14 @@ private: * @param[in] width Input width. * @param[in] height Input height. */ - void store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, const int height); + void store_coordinates(float *out, + const int offset, + const float center_x, + const float center_y, + const float box_width, + const float box_height, + const int width, + const int height); /** Function to calculate prior boxes. * * @param[in] window Input region on which to execute the kernel. diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp index 46a0f625ce..8e1ed3a2a5 100644 --- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp @@ -26,17 +26,17 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/NESymm.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/NESymm.h" #include <map> @@ -72,8 +72,8 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4 const int64_t b_3 = vgetlane(b_high, 1); int64x2x2_t result; - const int64x2_t result_0{ a_0 * b_0, a_1 * b_1 }; - const int64x2_t result_1{ a_2 * b_2, a_3 * b_3 }; + const int64x2_t result_0{a_0 * b_0, a_1 * b_1}; + const int64x2_t result_1{a_2 * b_2, a_3 * b_3}; result.val[0] = vadd(vmovl(vgetlow(bias)), result_0); result.val[1] = vadd(vmovl(vgethigh(bias)), result_1); @@ -81,15 +81,17 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4 } } // namespace -void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *output, const ITensor *weight, const ITensor *bias) +void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, + ITensor *output, + const ITensor *weight, + const ITensor *bias) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output); ARM_COMPUTE_ERROR_ON(input == output); ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), weight->info(), bias->info())); - static const std::map<DataType, ComputeFuncType> fn_map = - { - { DataType::QSYMM16, 
std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16) }, + static const std::map<DataType, ComputeFuncType> fn_map = { + {DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16)}, }; _input = input; @@ -102,10 +104,10 @@ void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *o _output->info()->set_quantization_info(compute_output_qinfo()); const UniformQuantizationInfo wq_info = _weight->info()->quantization_info().uniform(); - const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift); + const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift); _output_shift *= -1; - if(!bool(s)) + if (!bool(s)) { _output_multiplier = 0; _output_shift = 0; @@ -134,7 +136,10 @@ Window NEQLSTMLayerNormalizationKernel::configure_window(ITensor *target) return window; } -Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias) +Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *weight, + const ITensorInfo *bias) { ARM_COMPUTE_UNUSED(output, bias, weight, input); @@ -151,7 +156,7 @@ Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().x() != weight->tensor_shape().x()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -182,11 +187,11 @@ inline std::pair<int64_t, int64_t> NEQLSTMLayerNormalizationKernel::sum_qsymm16( using AccType = int64_t; using InputDataType = int16_t; - AccType sum{ 0 }; - AccType sum_sq{ 0 }; + AccType sum{0}; + AccType sum_sq{0}; int32_t x = _window_start_x; - for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) + for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) { using namespace wrapper; const int16x8_t val = vloadq(input_ptr + x); @@ -216,7 +221,7 @@ inline std::pair<int64_t, int64_t> NEQLSTMLayerNormalizationKernel::sum_qsymm16( #endif // __aarch64__ } - for(; x < _window_end_x; ++x) + for (; x < _window_end_x; ++x) { const InputDataType val = input_ptr[x]; sum += static_cast<AccType>(val); @@ -230,7 +235,9 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i int16_t *output_ptr, const int16_t *weight_ptr, const int32_t *bias_ptr, - int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift) + int32_t mean, + int32_t inv_std_mul, + int32_t inv_std_shift) { using OutputDataType = int16_t; @@ -238,7 +245,7 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i const int32x4_t mean_vec = vdup_n(mean, wrapper::traits::vector_128_tag{}); int32_t x = _window_start_x; - for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) + for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) { const int16x8_t val = vloadq(input_ptr + x); int32x4x2_t shifted; @@ -267,16 +274,18 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i vstore(output_ptr + x + 4, vqmovn(out_val.val[1])); } - for(; x < _window_end_x; ++x) + for (; x < _window_end_x; ++x) { - const 
auto val = static_cast<int32_t>(input_ptr[x]); - const int32_t shifted = (val << 10) - mean; - const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift); - const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x]; + const auto val = static_cast<int32_t>(input_ptr[x]); + const int32_t shifted = (val << 10) - mean; + const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift); + const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x]; const auto reverse_shifted = static_cast<int32_t>((weighted + 512) >> 10); - int32_t out_val = quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12); - out_val = utility::clamp<decltype(out_val), OutputDataType>(out_val, std::numeric_limits<OutputDataType>::min()); - output_ptr[x] = static_cast<OutputDataType>(out_val); + int32_t out_val = + quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12); + out_val = + utility::clamp<decltype(out_val), OutputDataType>(out_val, std::numeric_limits<OutputDataType>::min()); + output_ptr[x] = static_cast<OutputDataType>(out_val); } } @@ -287,35 +296,38 @@ void NEQLSTMLayerNormalizationKernel::compute_qsymm16() using BiasDataType = int32_t; using AccType = int64_t; - Iterator input_iterator{ _input, _inout_window }; - Iterator output_iterator{ _output, _inout_window }; - Iterator weight_iterator{ _weight, _weight_window }; - Iterator bias_iterator{ _bias, _weight_window }; + Iterator input_iterator{_input, _inout_window}; + Iterator output_iterator{_output, _inout_window}; + Iterator weight_iterator{_weight, _weight_window}; + Iterator bias_iterator{_bias, _weight_window}; const auto weight_ptr = reinterpret_cast<const InputDataType *>(weight_iterator.ptr()); const auto bias_ptr = reinterpret_cast<const BiasDataType *>(bias_iterator.ptr()); const uint32_t column_size = _input->info()->tensor_shape()[0]; - execute_window_loop(_inout_window, [ &, this](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const InputDataType *>(input_iterator.ptr()); - auto out_ptr = reinterpret_cast<OutputDataType *>(output_iterator.ptr()); - - AccType sum{ 0 }; - AccType sum_sq{ 0 }; - std::tie(sum, sum_sq) = sum_qsymm16(in_ptr); - - AccType mean{ 0 }; - AccType variance{ 0 }; - std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size); - - int32_t stddev_invsqrt_mul{}; - int32_t stddev_invsqrt_shift{}; - quantization::get_invsqrt_quantized_multiplier_exp(static_cast<int32_t>(variance), -1, stddev_invsqrt_mul, stddev_invsqrt_shift); - - normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift); - }, - input_iterator, output_iterator); + execute_window_loop( + _inout_window, + [&, this](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const InputDataType *>(input_iterator.ptr()); + auto out_ptr = reinterpret_cast<OutputDataType *>(output_iterator.ptr()); + + AccType sum{0}; + AccType sum_sq{0}; + std::tie(sum, sum_sq) = sum_qsymm16(in_ptr); + + AccType mean{0}; + AccType variance{0}; + std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size); + + int32_t stddev_invsqrt_mul{}; + int32_t stddev_invsqrt_shift{}; + quantization::get_invsqrt_quantized_multiplier_exp(static_cast<int32_t>(variance), -1, stddev_invsqrt_mul, + stddev_invsqrt_shift); + + normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, 
stddev_invsqrt_shift); + }, + input_iterator, output_iterator); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h index a3ff6e988f..af5b6a0315 100644 --- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h +++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H #include "src/core/NEON/INEKernel.h" + #include <functional> namespace arm_compute @@ -69,34 +70,26 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; private: // constants - static constexpr uint32_t max_input_dimension{ 2 }; /**< The maximum input dimension supported */ - static constexpr uint32_t max_weight_dimension{ 1 }; /**< The maximum weight dimension supported */ - static constexpr uint32_t max_bias_dimension{ 1 }; /**< The maximum bias dimension supported */ - static constexpr uint32_t vector_size_byte{ 16 }; /**< Computation vector size in byte */ + static constexpr uint32_t max_input_dimension{2}; /**< The maximum input dimension supported */ + static constexpr uint32_t max_weight_dimension{1}; /**< The maximum weight dimension supported */ + static constexpr uint32_t max_bias_dimension{1}; /**< The maximum bias dimension supported */ + static constexpr uint32_t vector_size_byte{16}; /**< Computation vector size in byte */ using ComputeFuncType = std::function<void(NEQLSTMLayerNormalizationKernel &)>; ComputeFuncType _fn{}; /**< Function pointer to computation function */ - const ITensor *_input - { - nullptr - }; /**< Input tensor */ - const ITensor *_weight - { - nullptr - }; /**< Weight tensor */ - const ITensor *_bias - { - nullptr - }; /**< Bias tensor */ - ITensor *_output{ nullptr }; /**< Output tensor */ + const ITensor *_input{nullptr}; /**< Input tensor */ + const ITensor *_weight{nullptr}; /**< Weight tensor */ + const ITensor *_bias{nullptr}; /**< Bias tensor */ + ITensor *_output{nullptr}; /**< Output tensor */ int32_t _output_multiplier{}; /**< Multiplier for output values */ int32_t _output_shift{}; /**< Shift value for output values */ @@ -138,7 +131,9 @@ private: int16_t *output_ptr, const int16_t *weight_ptr, const int32_t *bias_ptr, - int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift); + int32_t mean, + int32_t inv_std_mul, + int32_t inv_std_shift); /** Function to compute output quantization information */ QuantizationInfo compute_output_qinfo(); }; diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp index 802aebb526..486cd6d331 100644 --- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp +++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/misc/Utility.h" -#include "src/core/CPP/Validate.h" +#include "arm_compute/core/Window.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" 
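The include reshuffles in hunks like the one above are the visible effect of the new include-ordering rules. Judging purely from the hunks in this section (the formatting configuration itself is not shown here), headers are now sorted case-insensitively and split into blank-line-separated groups by top-level path: arm_compute/ first, then src/, then system headers. A representative block consistent with the changes in this patch:

    // Ordering inferred from the hunks in this patch, not from the actual
    // .clang-format file: a case-insensitive sort puts "common" before "CPP"
    // and "helpers" before "NEON"; blank lines separate the groups.
    #include "arm_compute/core/Window.h"

    #include "src/core/common/Registrars.h"
    #include "src/core/CPP/Validate.h"
    #include "src/core/helpers/WindowHelpers.h"
    #include "src/core/NEON/NEMath.h"

    #include <arm_neon.h>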
#include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/roialign/list.h" @@ -49,7 +50,12 @@ struct ROIAlignSelectorData }; using ROIAlignSelctorPtr = std::add_pointer<bool(const ROIAlignSelectorData &data)>::type; -using ROIAlignUKernelPtr = std::add_pointer<void(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)>::type; +using ROIAlignUKernelPtr = std::add_pointer<void(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info)>::type; struct ROIAlignKernel { @@ -58,31 +64,18 @@ struct ROIAlignKernel ROIAlignUKernelPtr ukernel; }; -static const ROIAlignKernel available_kernels[] = -{ - { - "fp32_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign) - }, +static const ROIAlignKernel available_kernels[] = { + {"fp32_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign) - }, + {"fp16_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #if defined(ARM_COMPUTE_ENABLE_NEON) - { - "qu8_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign) - }, - { - "qs8_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign) - }, + {"qu8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)}, + {"qs8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)}, #endif //defined(ARM_COMPUTE_ENABLE_NEON) }; @@ -94,9 +87,9 @@ static const ROIAlignKernel available_kernels[] = */ const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -104,24 +97,29 @@ const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data) return nullptr; } -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5); ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F32, DataType::F16); 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), + output->tensor_shape()); } - if(input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED) + if (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16); @@ -143,13 +141,17 @@ NEROIAlignLayerKernel::NEROIAlignLayerKernel() { } -void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIAlignLayerKernel::configure(const ITensor *input, + const ITensor *rois, + ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info)); // Output auto initialization if not yet initialized const TensorShape output_shape = compute_roi_align_shape(*input->info(), *rois->info(), pool_info); - auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); output->info()->set_data_layout(input->info()->data_layout()); // Configure kernel window @@ -167,7 +169,10 @@ void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois, INEKernel::configure(window); } -Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info)); return Status{}; @@ -176,9 +181,9 @@ Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorIn void NEROIAlignLayerKernel::run(const Window &window, const ThreadInfo &info) { const DataLayout data_layout = _input->info()->data_layout(); - if(data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC) { - const auto *uk = get_implementation(ROIAlignSelectorData{ _input->info()->data_type() }); + const auto *uk = get_implementation(ROIAlignSelectorData{_input->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_input, _output, _rois, _pool_info, window, info); diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.h b/src/core/NEON/kernels/NEROIAlignLayerKernel.h index 48a3de7285..9cc538b429 100644 --- a/src/core/NEON/kernels/NEROIAlignLayerKernel.h +++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.h @@ -83,7 +83,10 @@ public: * * @return a Status */ - static Status validate(const 
ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp index 400e8291d6..1a3810fb56 100644 --- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp @@ -22,9 +22,11 @@ * SOFTWARE. */ #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" + #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -36,7 +38,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, rois); @@ -47,10 +52,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height())); + ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || + (output->dimension(1) != pool_info.pooled_height())); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2)); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3)); } @@ -73,19 +79,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con * @param[in] roi_indx Index of image of coordinate in output Tensor to store value */ template <typename T> -void template_eval(const ITensor *input, const ITensor *output, int region_start_x, int region_start_y, - int region_end_x, int region_end_y, int fm, int px, int py, int roi_batch, int roi_indx) +void template_eval(const ITensor *input, + const ITensor *output, + int region_start_x, + int region_start_y, + int region_end_x, + int region_end_y, + int fm, + int px, + int py, + int roi_batch, + int roi_indx) { - if((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) { *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = 0; } else { T curr_max = std::numeric_limits<T>::lowest(); // Min value of typename T - for(int j = region_start_y; j < region_end_y; ++j) + for (int j = region_start_y; j < region_end_y; ++j) { - for(int i = region_start_x; i < region_end_x; ++i) + for (int i = region_start_x; i < region_end_x; ++i) { const auto val = *reinterpret_cast<const T *>(input->ptr_to_element(Coordinates(i, j, fm, roi_batch))); curr_max = std::max(val, curr_max); @@ -93,11 +108,13 @@ void template_eval(const 
ITensor *input, const ITensor *output, int region_start } // if quantized datatype, requantize then store in output tensor - if(is_data_type_quantized(input->info()->data_type())) + if (is_data_type_quantized(input->info()->data_type())) { // convert qasymm to new output quantization scale and offset - UniformQuantizationInfo uqinfo = compute_requantization_scale_offset(input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform()); - *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = quantize_qasymm8(curr_max, uqinfo); + UniformQuantizationInfo uqinfo = compute_requantization_scale_offset( + input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform()); + *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = + quantize_qasymm8(curr_max, uqinfo); } else { @@ -112,13 +129,19 @@ NEROIPoolingLayerKernel::NEROIPoolingLayerKernel() { } -Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info)); return Status{}; } -void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIPoolingLayerKernel::configure(const ITensor *input, + const ITensor *rois, + const ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois); @@ -126,12 +149,15 @@ void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *roi ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info)); // Output auto initialization if not yet initialized - TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1)); + TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), + rois->info()->dimension(1)); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), output->info()->quantization_info()); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + output->info()->quantization_info()); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height())); + ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || + (output->info()->dimension(1) != pool_info.pooled_height())); // Set instance variables _input = input; @@ -167,7 +193,7 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) const auto *rois_ptr = reinterpret_cast<const uint16_t *>(_rois->buffer()); const auto data_type = _input->info()->data_type(); - for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) + for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) { const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx]; const auto x1 = rois_ptr[values_per_roi * roi_indx + 1]; @@ -182,30 +208,35 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) const int roi_height = 
std::max(support::cpp11::round((y2 - y1) * spatial_scale), 1.f); // Iterate through all feature maps - for(int fm = 0; fm < fms; ++fm) + for (int fm = 0; fm < fms; ++fm) { // Iterate through all output pixels - for(int py = 0; py < pooled_h; ++py) + for (int py = 0; py < pooled_h; ++py) { - for(int px = 0; px < pooled_w; ++px) + for (int px = 0; px < pooled_w; ++px) { auto region_start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width)); - auto region_end_x = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width)); - auto region_start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height)); - auto region_end_y = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height)); + auto region_end_x = + static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width)); + auto region_start_y = + static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height)); + auto region_end_y = + static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height)); region_start_x = std::min(std::max(region_start_x + roi_anchor_x, 0), width); region_end_x = std::min(std::max(region_end_x + roi_anchor_x, 0), width); region_start_y = std::min(std::max(region_start_y + roi_anchor_y, 0), height); region_end_y = std::min(std::max(region_end_y + roi_anchor_y, 0), height); - switch(data_type) + switch (data_type) { case DataType::F32: - template_eval<float>(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx); + template_eval<float>(_input, _output, region_start_x, region_start_y, region_end_x, + region_end_y, fm, px, py, roi_batch, roi_indx); break; case DataType::QASYMM8: - template_eval<qasymm8_t>(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx); + template_eval<qasymm8_t>(_input, _output, region_start_x, region_start_y, region_end_x, + region_end_y, fm, px, py, roi_batch, roi_indx); break; default: ARM_COMPUTE_ERROR("DataType not Supported"); @@ -216,4 +247,4 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) } } } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h index e7a7e90eef..81f6006ea2 100644 --- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h +++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h @@ -63,7 +63,8 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois tensor. */ - void configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -82,7 +83,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); private: const ITensor *_input; diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp index ec63a35de9..87b7b76b72 100644 --- a/src/core/NEON/kernels/NERangeKernel.cpp +++ b/src/core/NEON/kernels/NERangeKernel.cpp @@ -29,11 +29,12 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/range/list.h" namespace arm_compute @@ -55,48 +56,23 @@ struct RangeUKernel RangeUKernelPtr ukernel; }; -static const RangeUKernel available_kernels[] = -{ - { - "fp16_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function) - }, - { - "f32_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function) - }, - { - "u8_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function) - }, - { - "u16_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::U16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function) - }, - { - "u32_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::U32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function) - }, - { - "s8_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::S8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function) - }, - { - "s16_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function) - }, - { - "s32_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function) - }, +static const RangeUKernel available_kernels[] = { + {"fp16_neon_range", [](const RangeSelectorData &data) { 
return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)}, + {"f32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)}, + {"u8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)}, + {"u16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)}, + {"u32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)}, + {"s8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)}, + {"s16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)}, + {"s32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)}, }; /** Micro-kernel selector @@ -107,9 +83,9 @@ static const RangeUKernel available_kernels[] = */ const RangeUKernel *get_implementation(const RangeSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -119,28 +95,31 @@ const RangeUKernel *get_implementation(const RangeSelectorData &data) Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step) { - const auto *uk = get_implementation(RangeSelectorData{ output.data_type() }); + const auto *uk = get_implementation(RangeSelectorData{output.data_type()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start > end) && (step >= 0)), "step must be less than 0 when start > end"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), "start value is outside the range of the data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), "end value is outside the range of the data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), "step value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), + "start value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), + "end value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), + "step value is outside the range of the data type"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.num_dimensions() != 1, "Output has to be a 1-D tensor"); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), + "Output tensor size is incorrect"); return Status{}; } } // namespace -NERangeKernel::NERangeKernel() - : _start(0), _end(1), _step(1), _output(nullptr) +NERangeKernel::NERangeKernel() : _start(0), _end(1), _step(1), _output(nullptr) { } @@ -151,7 +130,8 @@ void NERangeKernel::configure(ITensor *output, float start, float end, float ste ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*(output->info()), start, end, step)); // Auto initialize output if not initialized - auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1, output->info()->data_type(), output->info()->quantization_info()); + auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1, + output->info()->data_type(), output->info()->quantization_info()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -178,7 +158,7 @@ void NERangeKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = get_implementation(RangeSelectorData{ _output->info()->data_type() }); + const auto *uk = get_implementation(RangeSelectorData{_output->info()->data_type()}); uk->ukernel(_output, _start, _step, window); } diff --git a/src/core/NEON/kernels/NERangeKernel.h b/src/core/NEON/kernels/NERangeKernel.h index 90560995e6..fa555c2c2e 100644 --- a/src/core/NEON/kernels/NERangeKernel.h +++ b/src/core/NEON/kernels/NERangeKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NERANGEKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp index 19955af493..455d604b3b 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -28,16 +28,17 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/INEKernel.h" -#include "src/core/NEON/NEMath.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "support/SaturateCast.h" -#include "src/core/NEON/wrapper/wrapper.h" #include <arm_neon.h> namespace arm_compute @@ -48,7 +49,7 @@ namespace template <typename T> void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset = 0) { - if(std::is_same<T, uint8_t>::value) + if (std::is_same<T, uint8_t>::value) { auto res = wrapper::vcombine(wrapper::vqmovun(t1), wrapper::vqmovun(t2)); wrapper::vstore(output.ptr() + offset, res); @@ -63,8 +64,8 @@ void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset template <typename T> uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis) { - uint32x4_t mask{ 0 }; - 
if(op == ReductionOperation::ARG_IDX_MIN) + uint32x4_t mask{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { mask = wrapper::vcgt(b, a); } @@ -73,12 +74,12 @@ uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOp mask = wrapper::vclt(b, a); } - uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 }; - if(axis != 0) + uint32x4_t vec_idx = {idx, idx + 1, idx + 2, idx + 3}; + if (axis != 0) { vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); } - uint32x4x4_t res = { { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 } }; + uint32x4x4_t res = {{wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0}}; return res; } @@ -86,9 +87,9 @@ uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOp template <typename T> uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis) { - uint32x4x4_t mask{ { 0 } }; - uint8x16_t mask_u8{ 0 }; - if(op == ReductionOperation::ARG_IDX_MIN) + uint32x4x4_t mask{{0}}; + uint8x16_t mask_u8{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { mask_u8 = wrapper::vcgt(b, a); } @@ -96,44 +97,43 @@ uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, R { mask_u8 = wrapper::vclt(b, a); } - auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); - auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); - mask.val[0] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); - mask.val[1] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); - mask.val[2] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); - mask.val[3] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); - - uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 }, - { idx + 4, idx + 5, idx + 6, idx + 7 }, - { idx + 8, idx + 9, idx + 10, idx + 11 }, - { idx + 12, idx + 13, idx + 14, idx + 15 } - } - }; - if(axis != 0) + auto wide_u16_1 = + wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); + auto wide_u16_2 = + wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); + mask.val[0] = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); + mask.val[1] = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); + mask.val[2] = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); + mask.val[3] = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); + + uint32x4x4_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, + {idx + 4, idx + 5, idx + 6, idx + 7}, + {idx + 8, idx + 9, idx + 10, idx + 11}, + {idx + 12, idx + 13, idx + 14, idx + 15}}}; + if (axis != 0) { vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); } - uint32x4x4_t res = - { - { - vbslq_u32(mask.val[0], 
vec_idx.val[0], c.val[0]), - vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]), - vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), - vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3]) - } - }; + uint32x4x4_t res = { + {vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]), + vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])}}; return res; } // Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. template <typename T> -inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value, - typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type >::type - calculate_min(T in) +inline typename std::enable_if< + std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value, + typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type +calculate_min(T in) { auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); return wrapper::vpmin(pmin, pmin); @@ -141,9 +141,10 @@ inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_ // Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. template <typename T> -inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value, - typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type >::type - calculate_min(T in) +inline typename std::enable_if< + std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value, + typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type +calculate_min(T in) { auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); pmin = wrapper::vpmin(pmin, pmin); @@ -153,9 +154,10 @@ inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_s // Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. template <typename T> -inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value, - typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type >::type - calculate_max(T in) +inline typename std::enable_if< + std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value, + typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type +calculate_max(T in) { auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); return wrapper::vpmax(pmax, pmax); @@ -163,9 +165,10 @@ inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_ // Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. 
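The calculate_min/calculate_max helpers being reflowed here reduce a whole vector to a single value by repeated pairwise folding: each vpmin/vpmax halves the number of distinct candidates, so four float lanes take two rounds and sixteen byte lanes take four. A standalone sketch of the same folding in plain NEON intrinsics, with the wrapper:: layer dropped for brevity:

    #include <arm_neon.h>

    // {a,b,c,d} -> {max(c,d), max(a,b)} -> overall max in both lanes;
    // two pairwise rounds reduce four lanes, mirroring calculate_max above.
    static inline float horizontal_max_f32(float32x4_t in)
    {
        float32x2_t pmax = vpmax_f32(vget_high_f32(in), vget_low_f32(in));
        pmax             = vpmax_f32(pmax, pmax);
        return vget_lane_f32(pmax, 0);
    }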
template <typename T> -inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value, - typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type >::type - calculate_max(T in) +inline typename std::enable_if< + std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value, + typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type +calculate_max(T in) { auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); pmax = wrapper::vpmax(pmax, pmax); @@ -176,10 +179,10 @@ inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_s template <typename T> uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op) { - uint32x4_t res_idx_mask{ 0 }; + uint32x4_t res_idx_mask{0}; uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); - if(op == ReductionOperation::ARG_IDX_MIN) + if (op == ReductionOperation::ARG_IDX_MIN) { auto pmin = calculate_min(vec_res_value); auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); @@ -203,10 +206,10 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, Reduc template <typename T> uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op) { - uint32x4x4_t res_idx_mask{ { 0 } }; + uint32x4x4_t res_idx_mask{{0}}; uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); - uint8x16_t mask_u8{ 0 }; - if(op == ReductionOperation::ARG_IDX_MIN) + uint8x16_t mask_u8{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { auto pmin = calculate_min(vec_res_value); mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); @@ -218,12 +221,18 @@ uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_va } // Widen vectors - auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); - auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); - auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); - auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); - auto wide_u32_3 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); - auto wide_u32_4 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); + auto wide_u16_1 = + wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); + auto wide_u16_2 = + wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); + auto wide_u32_1 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); + auto wide_u32_2 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); + auto wide_u32_3 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); + auto wide_u32_4 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); res_idx_mask.val[2] = 
wrapper::vand(vec_res_idx.val[2], wide_u32_3); @@ -241,19 +250,19 @@ uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_va pmin = wrapper::vpmin(pmin, pmin); res = std::min(wrapper::vgetlane(pmin, 0), res); iter++; - } - while(iter < 4); + } while (iter < 4); return (res - 0xFFFFFFFF); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <> -uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis) +uint32x4x4_t +calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis) { - uint32x4x2_t mask{ 0 }; - uint16x8_t mask_u16{ 0 }; - if(op == ReductionOperation::ARG_IDX_MIN) + uint32x4x2_t mask{0}; + uint16x8_t mask_u16{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { mask_u16 = wrapper::vcgt(b, a); } @@ -263,19 +272,14 @@ uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x } mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16)); mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16)); - uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 }, - { idx + 4, idx + 5, idx + 6, idx + 7 } - } - }; - if(axis != 0) + uint32x4x2_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, {idx + 4, idx + 5, idx + 6, idx + 7}}}; + if (axis != 0) { vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); } - uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]), - wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), - 0, 0 - }; + uint32x4x4_t res = {wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]), + wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), 0, 0}; return res; } @@ -298,10 +302,10 @@ inline float16x4_t calculate_max(float16x8_t in) template <> uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op) { - uint32x4x2_t res_idx_mask{ 0 }; + uint32x4x2_t res_idx_mask{0}; uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); uint16x8_t mask_u16; - if(op == ReductionOperation::ARG_IDX_MIN) + if (op == ReductionOperation::ARG_IDX_MIN) { auto pmin = calculate_min(vec_res_value); mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); @@ -313,8 +317,10 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_va } // Widen vectors - auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16))); - auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16))); + auto wide_u32_1 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16))); + auto wide_u32_2 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16))); res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones); @@ -328,8 +334,7 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_va pmin = wrapper::vpmin(pmin, pmin); res = std::min(wrapper::vgetlane(pmin, 0), res); iter++; - } - while(iter < 2); + } while (iter < 2); return (res - 0xFFFFFFFF); } @@ -388,7 +393,8 @@ struct RedOpX /** SIMD vector tag type. 
*/ using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) { const size_t input_dim_0 = in->info()->dimension(0); const int window_step_x = 16 / sizeof(T); @@ -402,211 +408,217 @@ struct RedOpX Iterator output(out, out_window); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - - auto init_res_value = static_cast<T>(0.f); - switch(op) + in_win_no_pad, + [&](const Coordinates &) { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - init_res_value = static_cast<T>(*input_ptr); - break; - } - case ReductionOperation::PROD: - { - init_res_value = static_cast<T>(1.f); - break; - } - default: - break; - } - auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); - uint32x4x4_t vec_res_idx{ { 0 } }; + const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vec_elements = wrapper::vloadq(input_ptr + x); - switch(op) + auto init_res_value = static_cast<T>(0.f); + switch (op) { - case ReductionOperation::SUM_SQUARE: - vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); - break; - case ReductionOperation::MEAN_SUM: - case ReductionOperation::SUM: - vec_res_value = wrapper::vadd(vec_elements, vec_res_value); - break; - case ReductionOperation::PROD: - vec_res_value = wrapper::vmul(vec_elements, vec_res_value); - break; - case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; - break; - } case ReductionOperation::ARG_IDX_MAX: - { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; - break; - } + case ReductionOperation::ARG_IDX_MIN: case ReductionOperation::MIN: + case ReductionOperation::MAX: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + init_res_value = static_cast<T>(*input_ptr); break; } - case ReductionOperation::MAX: + case ReductionOperation::PROD: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + init_res_value = static_cast<T>(1.f); break; } default: - ARM_COMPUTE_ERROR("Not supported"); + break; } - } + auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); + uint32x4x4_t vec_res_idx{{0}}; - switch(op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - case ReductionOperation::SUM_SQUARE: + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { -#ifdef ARM_COMPUTE_DEBUG_ENABLED - auto res = static_cast<T>(0.f); - for(int i = 0; i < S; ++i) + const auto vec_elements = wrapper::vloadq(input_ptr + x); + switch (op) { - res += wrapper::vgetlane(vec_res_value, i); + case 
ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); + break; + case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::PROD: + vec_res_value = wrapper::vmul(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, + vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, + vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } -#else // ARM_COMPUTE_DEBUG_ENABLED - auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); - for(int i = 0; i < S / 4; ++i) + } + + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM_SQUARE: { - carry_res = wrapper::vpadd(carry_res, carry_res); - } - auto res = wrapper::vgetlane(carry_res, 0); +#ifdef ARM_COMPUTE_DEBUG_ENABLED + auto res = static_cast<T>(0.f); + for (int i = 0; i < S; ++i) + { + res += wrapper::vgetlane(vec_res_value, i); + } +#else // ARM_COMPUTE_DEBUG_ENABLED + auto carry_res = + wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + for (int i = 0; i < S / 4; ++i) + { + carry_res = wrapper::vpadd(carry_res, carry_res); + } + auto res = wrapper::vgetlane(carry_res, 0); #endif // ARM_COMPUTE_DEBUG_ENABLED - if(op == ReductionOperation::SUM_SQUARE) - { - // Compute left-over elements - for(; x < window_end_x; ++x) + if (op == ReductionOperation::SUM_SQUARE) + { + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += (*(input_ptr + x)) * (*(input_ptr + x)); + } + } + else { - res += (*(input_ptr + x)) * (*(input_ptr + x)); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += *(input_ptr + x); + } } + + if (op == ReductionOperation::MEAN_SUM) + { + res /= input_dim_0; + } + + *(reinterpret_cast<T *>(output.ptr())) = res; + break; } - else + case ReductionOperation::PROD: { + auto carry_res = + wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + T res = 1; + for (int i = 0; i < S / 2; ++i) + { + res *= wrapper::vgetlane(carry_res, i); + } + // Compute left-over elements - for(; x < window_end_x; ++x) + for (; x < window_end_x; ++x) { - res += *(input_ptr + x); + res *= *(input_ptr + x); } - } - if(op == ReductionOperation::MEAN_SUM) - { - res /= input_dim_0; + *(reinterpret_cast<T *>(output.ptr())) = res; + break; } - - *(reinterpret_cast<T *>(output.ptr())) = res; - break; - } - case ReductionOperation::PROD: - { - auto carry_res = wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); - T res = 1; - for(int i = 0; i < S / 2; ++i) + case ReductionOperation::ARG_IDX_MIN: { - res *= wrapper::vgetlane(carry_res, i); - } + auto idx = 
calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); + auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - res *= *(input_ptr + x); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) < res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; + break; } - - *(reinterpret_cast<T *>(output.ptr())) = res; - break; - } - case ReductionOperation::ARG_IDX_MIN: - { - auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); - auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::ARG_IDX_MAX: { - if(*(input_ptr + x) < res) + auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); + auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) { - idx = x; - res = *(input_ptr + x); + if (*(input_ptr + x) > res) + { + idx = x; + res = *(input_ptr + x); + } } + *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; + break; } - *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); - auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::MIN: { - if(*(input_ptr + x) > res) + auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) { - idx = x; - res = *(input_ptr + x); + res = *(input_ptr + x) < res ? *(input_ptr + x) : res; } + *(reinterpret_cast<T *>(output.ptr())) = res; + break; } - *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; - break; - } - case ReductionOperation::MIN: - { - auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::MAX: { - res = *(input_ptr + x) < res ? *(input_ptr + x) : res; - } - *(reinterpret_cast<T *>(output.ptr())) = res; - break; - } - case ReductionOperation::MAX: - { - auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - res = *(input_ptr + x) > res ? *(input_ptr + x) : res; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res = *(input_ptr + x) > res ? 
*(input_ptr + x) : res; + } + *(reinterpret_cast<T *>(output.ptr())) = res; + break; } - *(reinterpret_cast<T *>(output.ptr())) = res; - break; + default: + ARM_COMPUTE_ERROR("Not supported"); } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - }, - input, output); + }, + input, output); } }; template <typename T> struct RedOpX_quantized { - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) { using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type; @@ -637,246 +649,257 @@ struct RedOpX_quantized const float B = out_offset - (in_scale * in_offset) / (out_scale); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<T *>(input.ptr()); + in_win_no_pad, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<T *>(input.ptr()); + + auto vec_res_value1 = + wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value2 = + wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value3 = + wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value4 = + wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); + + auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f)); + auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f)); + auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f)); + auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f)); + + typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = {0}; + + if (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || + op == ReductionOperation::MIN || op == ReductionOperation::MAX) + { + vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{}); + } + + uint32x4x4_t vec_res_idx{{0}}; + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vec_elements = wrapper::vloadq(input_ptr + x); + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - auto vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f)); - auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f)); - auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f)); - auto vec_res_value4_f = 
vdupq_n_f32(static_cast<float>(1.f)); + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); + vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset); + const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale); - typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = { 0 }; + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN || op == ReductionOperation::MAX) - { - vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{}); - } + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - uint32x4x4_t vec_res_idx{ { 0 } }; - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vec_elements = wrapper::vloadq(input_ptr + x); - switch(op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - { - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); - vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); - vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); - vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); - break; - } - case ReductionOperation::PROD: - { - const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset); - const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale); - - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1); - auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2); - auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3); - auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4); - - //de-quantize vec_elements - temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4); - temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4); - temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4); - temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4); - - vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f); - vec_res_value2_f = 
vmulq_f32(temp32x4f_2, vec_res_value2_f); - vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f); - vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f); - break; + auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1); + auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2); + auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3); + auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4); + + //de-quantize vec_elements + temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4); + temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4); + temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4); + temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4); + + vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f); + vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f); + vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f); + vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>( + x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>( + x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } + } + + switch (op) + { case ReductionOperation::ARG_IDX_MIN: { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; + auto idx = + calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); + auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) < res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; break; } case ReductionOperation::ARG_IDX_MAX: { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; + auto idx = + calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); + auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) > res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; break; } case ReductionOperation::MIN: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - break; - } - case ReductionOperation::MAX: - { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - switch(op) - { 
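A note on the quantized SUM/MEAN_SUM path in this hunk: the 8-bit lanes are widened (u8 -> u16 -> u32 via vmovl) into four 32-bit accumulators so that summing along a whole axis cannot overflow, and the final mean is produced with a single affine map, resFinal = A * sum + B, where B = out_offset - (in_scale * in_offset) / out_scale as computed above; given that B, A (defined earlier in this file, above this hunk) must fold the in/out scale ratio together with the 1/N averaging factor. A self-contained scalar sketch of the same scheme, assuming uniform asymmetric quantization; the function and parameter names are illustrative, not taken from the library:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    // Dequantize -> average -> requantize, folded into res = A * sum_q + B.
    uint8_t mean_requantized(const uint8_t *q, std::size_t n,
                             float in_scale, int32_t in_offset,
                             float out_scale, int32_t out_offset)
    {
        int32_t sum_q = 0; // 32-bit accumulator, mirroring the widened NEON lanes
        for (std::size_t i = 0; i < n; ++i)
        {
            sum_q += q[i];
        }
        // mean_real = in_scale * (sum_q / n - in_offset), then requantize with
        // out_scale / out_offset; both steps collapse into one affine map.
        const float A = in_scale / (out_scale * static_cast<float>(n));
        const float B = static_cast<float>(out_offset) -
                        (in_scale * static_cast<float>(in_offset)) / out_scale;
        const float res = A * static_cast<float>(sum_q) + B;
        // Clamp to the representable range, as saturate_cast<T> does in the kernel.
        return static_cast<uint8_t>(std::lround(std::min(255.0f, std::max(0.0f, res))));
    }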
- case ReductionOperation::ARG_IDX_MIN: - { - auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); - auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - if(*(input_ptr + x) < res) + // Compute left-over elements + for (; x < window_end_x; ++x) { - idx = x; - res = *(input_ptr + x); + res = *(input_ptr + x) < res ? *(input_ptr + x) : res; } + *(reinterpret_cast<T *>(output.ptr())) = res; + break; } - *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); - auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::MAX: { - if(*(input_ptr + x) > res) + auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) { - idx = x; - res = *(input_ptr + x); + res = *(input_ptr + x) > res ? *(input_ptr + x) : res; } + *(reinterpret_cast<T *>(output.ptr())) = res; + break; } - *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; - break; - } - case ReductionOperation::MIN: - { - auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::PROD: { - res = *(input_ptr + x) < res ? *(input_ptr + x) : res; - } - *(reinterpret_cast<T *>(output.ptr())) = res; - break; - } - case ReductionOperation::MAX: - { - auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f); + carry_res = wrapper::vmul(carry_res, vec_res_value3_f); + carry_res = wrapper::vmul(carry_res, vec_res_value4_f); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - res = *(input_ptr + x) > res ? 
*(input_ptr + x) : res; - } - *(reinterpret_cast<T *>(output.ptr())) = res; - break; - } - case ReductionOperation::PROD: - { - auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f); - carry_res = wrapper::vmul(carry_res, vec_res_value3_f); - carry_res = wrapper::vmul(carry_res, vec_res_value4_f); + float res = wrapper::vgetlane(carry_res, 0); + res *= wrapper::vgetlane(carry_res, 1); + res *= wrapper::vgetlane(carry_res, 2); + res *= wrapper::vgetlane(carry_res, 3); - float res = wrapper::vgetlane(carry_res, 0); - res *= wrapper::vgetlane(carry_res, 1); - res *= wrapper::vgetlane(carry_res, 2); - res *= wrapper::vgetlane(carry_res, 3); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + //de-quantize input + if (std::is_same<T, uint8_t>::value) + { + res *= dequantize_qasymm8(*(input_ptr + x), iq_info); + } + else + { + res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info); + } + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - //de-quantize input - if(std::is_same<T, uint8_t>::value) + //re-quantize result + if (std::is_same<T, uint8_t>::value) { - res *= dequantize_qasymm8(*(input_ptr + x), iq_info); + res = quantize_qasymm8(res, iq_info); } else { - res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info); + res = quantize_qasymm8_signed(res, iq_info); } - } - //re-quantize result - if(std::is_same<T, uint8_t>::value) - { - res = quantize_qasymm8(res, iq_info); + *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res); + break; } - else + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: { - res = quantize_qasymm8_signed(res, iq_info); - } + auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2); + carry_res = wrapper::vadd(carry_res, vec_res_value3); + carry_res = wrapper::vadd(carry_res, vec_res_value4); - *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res); - break; - } - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - { - auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2); - carry_res = wrapper::vadd(carry_res, vec_res_value3); - carry_res = wrapper::vadd(carry_res, vec_res_value4); + auto carry_paddition = + wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res)); + carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); + auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0)); - auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res)); - carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); - auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0)); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += *(input_ptr + x); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - res += *(input_ptr + x); - } + if (op == ReductionOperation::MEAN_SUM) + { + const int32_t resFinal = A * (static_cast<float>(res)) + B; - if(op == ReductionOperation::MEAN_SUM) - { - const int32_t resFinal = A * (static_cast<float>(res)) + B; + *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal); + } + else + { + // Subtract accumulated offsets + res -= (in_info.dimension(0) - 1) * iq_info.offset; + *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res); + } - *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal); - } - else - { - // Subtract accumulated offsets - res -= (in_info.dimension(0) - 1) * iq_info.offset; - *reinterpret_cast<T *>(output.ptr()) = 
utils::cast::saturate_cast<T>(res); + break; } - - break; + default: + ARM_COMPUTE_ERROR("Not supported"); } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - }, - input, output); + }, + input, output); } }; @@ -887,7 +910,12 @@ struct RedOpYZW using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; using neon_vector = typename wrapper::traits::neon_vector<T, S>::type; - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op) + inline void operator()(const Window &in_window, + Window &out_window, + const ITensor *in, + ITensor *out, + int axis, + const ReductionOperation op) { const TensorInfo in_info = *(in->info()); const int window_step_x = 16 / sizeof(T); @@ -900,203 +928,210 @@ struct RedOpYZW Window in_win_no_pad = in_window; in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); Window out_win_no_pad = out_window; - out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); Iterator input(in, in_win_no_pad); Iterator output(out, out_win_no_pad); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<T *>(input.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + in_win_no_pad, + [&](const Coordinates &) { - neon_vector vec_res_value = { 0 }; - switch(op) - { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - vec_res_value = wrapper::vloadq(input_ptr + x); - break; - } - case ReductionOperation::PROD: - { - vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{}); - break; - } - default: - { - vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - break; - } - } - uint32x4x4_t vec_res_idx{ { 0 } }; + const auto input_ptr = reinterpret_cast<T *>(input.ptr()); - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); - const auto vec_elements = wrapper::vloadq(in_ptr); - switch(op) + neon_vector vec_res_value = {0}; + switch (op) { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - vec_res_value = wrapper::vadd(vec_elements, vec_res_value); - break; - case ReductionOperation::SUM_SQUARE: - vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); - break; - case ReductionOperation::PROD: - vec_res_value = wrapper::vmul(vec_elements, vec_res_value); - break; - case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; - break; - } case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index(dim, 
temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; + vec_res_value = wrapper::vloadq(input_ptr + x); break; } - case ReductionOperation::MIN: + case ReductionOperation::PROD: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{}); break; } - case ReductionOperation::MAX: + default: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); break; } - default: - ARM_COMPUTE_ERROR("Not supported"); } - } - - if(op == ReductionOperation::MEAN_SUM) - { - auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{})); - vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv); - } + uint32x4x4_t vec_res_idx{{0}}; - if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) - { - wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - if(std::is_same<T, float16_t>::value) + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) { - wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]); + const T *in_ptr = + reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); + const auto vec_elements = wrapper::vloadq(in_ptr); + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); + break; + case ReductionOperation::PROD: + vec_res_value = wrapper::vmul(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = + calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = + calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - } - else - { - wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value); - } - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto res_value = 0.f; - switch(op) - { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: + if (op == ReductionOperation::MEAN_SUM) { - res_value = *(input_ptr + x); - break; + auto vec_width_inv = + wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{})); + vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv); } - case ReductionOperation::PROD: + + if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) { - res_value = static_cast<T>(1.f); - break; + wrapper::vstore(reinterpret_cast<uint32_t 
*>(output.ptr()) + x, vec_res_idx.val[0]); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + if (std::is_same<T, float16_t>::value) + { + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]); + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC } - default: + else { - res_value = static_cast<T>(0.f); - break; + wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value); } } - uint32_t res_idx = 0; - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + // Compute left-over elements + for (; x < window_end_x; ++x) { - const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); - - switch(op) + auto res_value = 0.f; + switch (op) { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - res_value += *in_ptr; - break; - case ReductionOperation::SUM_SQUARE: - res_value += *in_ptr * *in_ptr; - break; - case ReductionOperation::PROD: - res_value *= *in_ptr; - break; + case ReductionOperation::ARG_IDX_MAX: case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: { - if(*in_ptr < res_value) - { - res_value = *in_ptr; - res_idx = dim; - } + res_value = *(input_ptr + x); break; } - case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::PROD: { - if(*in_ptr > res_value) - { - res_value = *in_ptr; - res_idx = dim; - } + res_value = static_cast<T>(1.f); break; } - case ReductionOperation::MIN: + default: { - res_value = *in_ptr < res_value ? *in_ptr : res_value; + res_value = static_cast<T>(0.f); break; } - case ReductionOperation::MAX: + } + + uint32_t res_idx = 0; + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + const T *in_ptr = + reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); + + switch (op) { - res_value = *in_ptr > res_value ? *in_ptr : res_value; - break; + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + res_value += *in_ptr; + break; + case ReductionOperation::SUM_SQUARE: + res_value += *in_ptr * *in_ptr; + break; + case ReductionOperation::PROD: + res_value *= *in_ptr; + break; + case ReductionOperation::ARG_IDX_MIN: + { + if (*in_ptr < res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + if (*in_ptr > res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::MIN: + { + res_value = *in_ptr < res_value ? *in_ptr : res_value; + break; + } + case ReductionOperation::MAX: + { + res_value = *in_ptr > res_value ? 
*in_ptr : res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } - default: - ARM_COMPUTE_ERROR("Not supported"); } - } - if(op == ReductionOperation::MEAN_SUM) - { - res_value /= in_info.dimension(axis); - } + if (op == ReductionOperation::MEAN_SUM) + { + res_value /= in_info.dimension(axis); + } - if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) - { - *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx; - } - else - { - *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value; + if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) + { + *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx; + } + else + { + *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value; + } } - } - }, - input, output); + }, + input, output); } }; @@ -1107,7 +1142,8 @@ struct RedOpYZW_complex using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; using neon_vector = typename wrapper::traits::neon_vector<T, S>::type; - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation) + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation) { ARM_COMPUTE_ERROR_ON(axis != 2); ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM); @@ -1124,70 +1160,77 @@ struct RedOpYZW_complex Window in_win_no_pad = in_window; in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); Window out_win_no_pad = out_window; - out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); Iterator input(in, in_win_no_pad); Iterator output(out, out_win_no_pad); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + in_win_no_pad, + [&](const Coordinates &) { - neon_vector vec_res_value_0 = { 0 }; - neon_vector vec_res_value_1 = { 0 }; - - vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - - T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T)); - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); - T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim); + neon_vector vec_res_value_0 = {0}; + neon_vector vec_res_value_1 = {0}; - const auto vec_elements_0 = wrapper::vloadq(in_ptr_0); - const auto vec_elements_1 = wrapper::vloadq(in_ptr_1); + vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0); - vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1); - } + T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T)); + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr_0 = reinterpret_cast<T 
*>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); + T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim); - wrapper::vstore(out_ptr, vec_res_value_0); - wrapper::vstore(out_ptr + 4, vec_res_value_1); - } + const auto vec_elements_0 = wrapper::vloadq(in_ptr_0); + const auto vec_elements_1 = wrapper::vloadq(in_ptr_1); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto res_value_0 = 0.f; - auto res_value_1 = 0.f; + vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0); + vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1); + } - T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T)); - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + wrapper::vstore(out_ptr, vec_res_value_0); + wrapper::vstore(out_ptr + 4, vec_res_value_1); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { - T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); - res_value_0 += *in_ptr; - res_value_1 += *(in_ptr + 1); + auto res_value_0 = 0.f; + auto res_value_1 = 0.f; + + T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T)); + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); + res_value_0 += *in_ptr; + res_value_1 += *(in_ptr + 1); + } + *out_ptr = res_value_0; + *(out_ptr + 1) = res_value_1; } - *out_ptr = res_value_0; - *(out_ptr + 1) = res_value_1; - } - }, - input, output); + }, + input, output); } }; template <typename T> struct RedOpYZW_quantized { - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op) + inline void operator()(const Window &in_window, + Window &out_window, + const ITensor *in, + ITensor *out, + int axis, + const ReductionOperation op) { const TensorInfo in_info = *(in->info()); const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); - using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type; + using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type; const auto oq_info = out->info()->quantization_info().uniform(); @@ -1201,12 +1244,14 @@ struct RedOpYZW_quantized Window in_win_no_pad = in_window; in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); Window out_win_no_pad = out_window; - out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); Iterator input(in, in_win_no_pad); Iterator output(out, out_win_no_pad); - using vector_type = typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type; + using vector_type = + typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type; using vector_type_f = typename wrapper::traits::neon_vector<float, 4>::type; vector_type vec_res_value1{}; @@ -1234,362 +1279,384 @@ struct RedOpYZW_quantized const auto vec_B = wrapper::vdup_n(static_cast<float>(B), wrapper::traits::vector_128_tag{}); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<T *>(input.ptr()); - - // Compute window_step_x elements per iteration - int x 
= window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + in_win_no_pad, + [&](const Coordinates &) { - uint32x4x4_t vec_res_idx{ { 0 } }; - vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); - vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); - vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); - vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); + const auto input_ptr = reinterpret_cast<T *>(input.ptr()); - vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); - vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); - vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); - vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + uint32x4x4_t vec_res_idx{{0}}; + vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); + vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); + vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); + vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); - auto vec_res_value = wrapper::vloadq(input_ptr + x); + vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); + vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); + vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); + vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); - for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) - { - const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim; - const auto vec_elements = wrapper::vloadq(in_ptr); - switch(op) + auto vec_res_value = wrapper::vloadq(input_ptr + x); + + for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: + const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim; + const auto vec_elements = wrapper::vloadq(in_ptr); + switch (op) { - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); - vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); - vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); - vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); - break; + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = 
wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); + vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), + wrapper::traits::vector_128_tag{}); + const auto scale32x4f_4 = + wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{}); + + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1); + auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2); + auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3); + auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4); + + //de-quantize vec_elements + temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4); + temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4); + temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4); + temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4); + + vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f); + vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f); + vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f); + vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, + vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, + vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } - case ReductionOperation::PROD: - { - const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{}); - const auto scale32x4f_4 = wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{}); - - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = 
wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1); - auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2); - auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3); - auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4); + } - //de-quantize vec_elements - temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4); - temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4); - temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4); - temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4); - - vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f); - vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f); - vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f); - vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f); - break; - } + switch (op) + { case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; - break; - } case ReductionOperation::ARG_IDX_MAX: { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]); + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]); + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]); + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12, + vec_res_idx.val[3]); break; } case ReductionOperation::MIN: - { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - break; - } case ReductionOperation::MAX: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value); break; } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - switch(op) - { - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::ARG_IDX_MAX: - { - wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]); - wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]); - wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]); - wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12, vec_res_idx.val[3]); - break; - } - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value); - break; - } - case ReductionOperation::SUM: - { - // Subtract offsets - auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset); + case ReductionOperation::SUM: + { + // Subtract offsets + auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset); - auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1); - auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2); - auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3); - auto vec_res_s_value4 = 
+                auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1);
+                auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2);
+                auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3);
+                auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4);

-                vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets);
-                vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets);
-                vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets);
-                vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets);
+                vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets);
+                vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets);
+                vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets);
+                vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets);

-                const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2));
-                const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4));
+                const auto temp16x8t_1 =
+                    wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2));
+                const auto temp16x8t_2 =
+                    wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4));

-                combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
-                break;
-            }
-            case ReductionOperation::MEAN_SUM:
-            {
-                vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A);
-                vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A);
-                vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A);
-                vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);
+                combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
+                break;
+            }
+            case ReductionOperation::MEAN_SUM:
+            {
+                vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A);
+                vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A);
+                vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A);
+                vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);

 #ifdef __aarch64__
-                vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f);
-                vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f);
-                vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f);
-                vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f);
+                vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f);
+                vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f);
+                vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f);
+                vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f);
 #else  // defined(__aarch64__)
-                vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f);
-                vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f);
-                vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f);
-                vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f);
+                vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f);
+                vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f);
+                vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f);
+                vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f);
 #endif // __aarch64__

-                const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
-                const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
-                auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
-
-                wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
-                break;
-            }
-            case ReductionOperation::PROD:
-            {
-                const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
-                const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale));
-
-                //re-quantize
-                vec_res_value1_f = wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
-                vec_res_value2_f = wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
-                vec_res_value3_f = wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
-                vec_res_value4_f = wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);
-
-                vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
-                vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
-                vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
-                vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
+                const auto temp16x8t_1 =
+                    wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+                const auto temp16x8t_2 =
+                    wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+                auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));

-                const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
-                const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
-                auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
-
-                wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
-                break;
+                wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
+                break;
+            }
+            case ReductionOperation::PROD:
+            {
+                const auto offset32x4f_4 =
+                    wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
+                const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale));
+
+                //re-quantize
+                vec_res_value1_f =
+                    wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
+                vec_res_value2_f =
+                    wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
+                vec_res_value3_f =
+                    wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
+                vec_res_value4_f =
+                    wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);
+
+                vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
+                vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
+                vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
+                vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
+
+                const auto temp16x8t_1 =
+                    wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+                const auto temp16x8t_2 =
+                    wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+                auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
+
+                wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Not supported");
             }
-            default:
-                ARM_COMPUTE_ERROR("Not supported");
-            }
-        }
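Aside on the SUM store path above: with the asymmetric scheme x ≈ scale · (q − offset), accumulating N quantized inputs picks up N copies of the offset, while the quantized encoding of the true sum needs exactly one, hence the subtraction of (N − 1) · offset before the saturating narrow. A minimal scalar sketch of that identity (hypothetical helper, not part of this patch):

    #include <cstdint>

    // Sum n quantized values and re-encode the result in the same quantized space.
    // Each q[i] carries one 'offset'; the encoded sum must carry exactly one.
    int32_t quantized_sum(const uint8_t *q, int n, int32_t offset)
    {
        int32_t acc = 0;
        for (int i = 0; i < n; ++i)
        {
            acc += q[i];
        }
        return acc - (n - 1) * offset; // same correction as the vectorised SUM path above
    }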
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            float   res_value   = 0.f;
-            int32_t res_value_q = 0;
-
-            switch(op)
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
             {
-                case ReductionOperation::ARG_IDX_MAX:
-                case ReductionOperation::ARG_IDX_MIN:
-                case ReductionOperation::MIN:
-                case ReductionOperation::MAX:
-                {
-                    res_value = *(input_ptr + x);
-                    break;
-                }
-                case ReductionOperation::PROD:
-                {
-                    res_value = static_cast<T>(1.0f);
-                    break;
-                }
-                default:
-                {
-                    res_value = static_cast<T>(0.0f);
-                    break;
-                }
-            }
-            uint32_t res_idx = 0;
+                float   res_value   = 0.f;
+                int32_t res_value_q = 0;

-            for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
-            {
-                const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
-                switch(op)
+                switch (op)
                 {
-                    case ReductionOperation::SUM:
+                    case ReductionOperation::ARG_IDX_MAX:
+                    case ReductionOperation::ARG_IDX_MIN:
+                    case ReductionOperation::MIN:
+                    case ReductionOperation::MAX:
                     {
-                        res_value += *in_ptr;
+                        res_value = *(input_ptr + x);
                         break;
                     }
-                    case ReductionOperation::MEAN_SUM:
+                    case ReductionOperation::PROD:
                     {
-                        res_value_q += *in_ptr;
+                        res_value = static_cast<T>(1.0f);
                         break;
                     }
-                    case ReductionOperation::SUM_SQUARE:
+                    default:
                     {
-                        res_value += *in_ptr * *in_ptr;
+                        res_value = static_cast<T>(0.0f);
                         break;
                     }
-                    case ReductionOperation::PROD:
+                }
+                uint32_t res_idx = 0;
+
+                for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+                {
+                    const T *in_ptr =
+                        reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
+                    switch (op)
                     {
-                        //de-quantize input
-                        if(std::is_same<T, uint8_t>::value)
+                        case ReductionOperation::SUM:
                         {
-                            res_value *= dequantize_qasymm8(*in_ptr, iq_info);
+                            res_value += *in_ptr;
+                            break;
                         }
-                        else
+                        case ReductionOperation::MEAN_SUM:
                         {
-                            res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info);
+                            res_value_q += *in_ptr;
+                            break;
                         }
-                        break;
-                    }
-                    case ReductionOperation::ARG_IDX_MIN:
-                    {
-                        if(*in_ptr < res_value)
+                        case ReductionOperation::SUM_SQUARE:
                         {
-                            res_value = *in_ptr;
-                            res_idx   = dim;
+                            res_value += *in_ptr * *in_ptr;
+                            break;
                         }
-                        break;
-                    }
-                    case ReductionOperation::ARG_IDX_MAX:
-                    {
-                        if(*in_ptr > res_value)
+                        case ReductionOperation::PROD:
                         {
-                            res_value = *in_ptr;
-                            res_idx   = dim;
+                            //de-quantize input
+                            if (std::is_same<T, uint8_t>::value)
+                            {
+                                res_value *= dequantize_qasymm8(*in_ptr, iq_info);
+                            }
+                            else
+                            {
+                                res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info);
+                            }
+                            break;
                         }
-                        break;
+                        case ReductionOperation::ARG_IDX_MIN:
+                        {
+                            if (*in_ptr < res_value)
+                            {
+                                res_value = *in_ptr;
+                                res_idx   = dim;
+                            }
+                            break;
+                        }
+                        case ReductionOperation::ARG_IDX_MAX:
+                        {
+                            if (*in_ptr > res_value)
+                            {
+                                res_value = *in_ptr;
+                                res_idx   = dim;
+                            }
+                            break;
+                        }
+                        case ReductionOperation::MIN:
+                        {
+                            res_value = *in_ptr < res_value ? *in_ptr : res_value;
+                            break;
+                        }
+                        case ReductionOperation::MAX:
+                        {
+                            res_value = *in_ptr > res_value ? *in_ptr : res_value;
+                            break;
+                        }
+                        default:
+                            ARM_COMPUTE_ERROR("Not supported");
                     }
-                    case ReductionOperation::MIN:
+                }
+
+                switch (op)
+                {
+                    case ReductionOperation::MEAN_SUM:
                     {
-                        res_value = *in_ptr < res_value ? *in_ptr : res_value;
+                        // Apply previously calculated coefficients (with rounding on aarch64)
+#ifdef __aarch64__
+                        const int32_t res =
+                            arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B);
+#else  // defined(__aarch64__)
+                        const int32_t res = A * (static_cast<float>(res_value_q)) + B;
+#endif // __aarch64__
+                        *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
                         break;
                     }
-                    case ReductionOperation::MAX:
+                    case ReductionOperation::SUM:
                     {
-                        res_value = *in_ptr > res_value ? *in_ptr : res_value;
+                        // Subtract accumulated offsets
+                        res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
+                        *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
                         break;
                     }
-                    default:
-                        ARM_COMPUTE_ERROR("Not supported");
-                }
-            }
-
-            switch(op)
-            {
-                case ReductionOperation::MEAN_SUM:
-                {
-                    // Apply previously calculated coefficients (with rounding on aarch64)
-#ifdef __aarch64__
-                    const int32_t res = arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B);
-#else  // defined(__aarch64__)
-                    const int32_t res = A * (static_cast<float>(res_value_q)) + B;
-#endif // __aarch64__
-                    *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
-                    break;
-                }
-                case ReductionOperation::SUM:
-                {
-                    // Subtract accumulated offsets
-                    res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
-                    *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
-                    break;
-                }
-                case ReductionOperation::PROD:
-                {
-                    //re-quantize result
-                    T res = 0;
-                    if(std::is_same<T, uint8_t>::value)
+                    case ReductionOperation::PROD:
                     {
-                        res = quantize_qasymm8(res_value, iq_info);
+                        //re-quantize result
+                        T res = 0;
+                        if (std::is_same<T, uint8_t>::value)
+                        {
+                            res = quantize_qasymm8(res_value, iq_info);
+                        }
+                        else
+                        {
+                            res = quantize_qasymm8_signed(res_value, iq_info);
+                        }
+                        *(reinterpret_cast<T *>(output.ptr() + x)) = res;
+                        break;
                     }
-                    else
+                    case ReductionOperation::ARG_IDX_MIN:
+                    case ReductionOperation::ARG_IDX_MAX:
                     {
-                        res = quantize_qasymm8_signed(res_value, iq_info);
+                        *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx;
+                        break;
                     }
-                    *(reinterpret_cast<T *>(output.ptr() + x)) = res;
-                    break;
-                }
-                case ReductionOperation::ARG_IDX_MIN:
-                case ReductionOperation::ARG_IDX_MAX:
-                {
-                    *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx;
-                    break;
+                    default:
+                        *(reinterpret_cast<T *>(output.ptr() + x)) = res_value;
                 }
-                default:
-                    *(reinterpret_cast<T *>(output.ptr() + x)) = res_value;
             }
-        }
-    },
-    input, output);
+        },
+        input, output);
 }
 };
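The #ifdef __aarch64__ forks in the MEAN_SUM paths above are numerical, not cosmetic: vcvta (and support::cpp11::round on this scalar tail) rounds to the nearest integer, while the plain conversion truncates toward zero. A small standalone illustration of the difference (hypothetical helper, not library code):

    #include <cmath>
    #include <cstdint>

    // 2.7f -> 3 with rounding but 2 when truncated; -2.7f -> -3 vs -2.
    int32_t to_int(float v, bool round_to_nearest)
    {
        return round_to_nearest ? static_cast<int32_t>(std::round(v)) : static_cast<int32_t>(v);
    }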
-void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
+void reduce_op(
+    const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
 {
     const bool is_complex = (input->info()->num_channels() == 2);

-    if(is_complex)
+    if (is_complex)
     {
-        switch(axis)
+        switch (axis)
         {
             case 2:
-                switch(input->info()->data_type())
+                switch (input->info()->data_type())
                 {
                     case DataType::F32:
-                        switch(op)
+                        switch (op)
                         {
                             case ReductionOperation::SUM:
-                                return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(), op);
+                                return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(
+                                    window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(),
+                                    op);
                             default:
                                 ARM_COMPUTE_ERROR("Not supported");
                         }
@@ -1602,19 +1669,21 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
         return;
     }

-    switch(axis)
+    switch (axis)
     {
         case 0:
         {
-            switch(input->info()->data_type())
+            switch (input->info()->data_type())
             {
                 case DataType::QASYMM8:
                 {
-                    return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output, RedOpX_quantized<uint8_t>(), op);
+                    return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output,
+                                                                       RedOpX_quantized<uint8_t>(), op);
                 }
                 case DataType::QASYMM8_SIGNED:
                 {
-                    return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(), op);
+                    return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(),
+                                                                      op);
                 }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
@@ -1635,19 +1704,22 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
             }
         }
         case 1:
-            switch(input->info()->data_type())
+            switch (input->info()->data_type())
             {
                 case DataType::QASYMM8:
                 {
-                    return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+                    return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output,
+                                                                         RedOpYZW_quantized<uint8_t>(), op);
                 }
                 case DataType::QASYMM8_SIGNED:
                 {
-                    return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+                    return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output,
+                                                                        RedOpYZW_quantized<int8_t>(), op);
                 }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
+                    return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(),
+                                                                    op);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
                     return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
@@ -1657,15 +1729,18 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
                     ARM_COMPUTE_ERROR("Not supported");
             }
         case 2:
-            switch(input->info()->data_type())
+            switch (input->info()->data_type())
             {
                 case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+                    return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output,
+                                                                         RedOpYZW_quantized<uint8_t>(), op);
                 case DataType::QASYMM8_SIGNED:
-                    return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+                    return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output,
+                                                                        RedOpYZW_quantized<int8_t>(), op);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op);
+                    return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(),
+                                                                    op);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
                     return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
@@ -1675,15 +1750,18 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
                     ARM_COMPUTE_ERROR("Not supported");
             }
         case 3:
-            switch(input->info()->data_type())
+            switch (input->info()->data_type())
             {
                 case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+                    return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output,
+                                                                         RedOpYZW_quantized<uint8_t>(), op);
                 case DataType::QASYMM8_SIGNED:
-                    return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+                    return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output,
+                                                                        RedOpYZW_quantized<int8_t>(), op);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op);
+                    return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(),
+                                                                    op);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
                     return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
@@ -1704,9 +1782,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);

-    if(input->num_channels() == 1)
+    if (input->num_channels() == 1)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+                                                             DataType::S32, DataType::F16, DataType::F32);
     }
     else
     {
@@ -1715,13 +1794,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
         ARM_COMPUTE_RETURN_ERROR_ON(axis != 2);
     }

-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+                                    "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");

-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
-        if(!is_arg_min_max)
+        if (!is_arg_min_max)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
             ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
@@ -1731,8 +1811,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
         }

-        const TensorShape output_shape         = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
-        const TensorInfo  tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
+        const TensorShape output_shape =
+            arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
+        const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
     }

@@ -1745,7 +1826,10 @@ NEReductionOperationKernel::NEReductionOperationKernel()
 {
 }

-void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
+void NEReductionOperationKernel::configure(const ITensor     *input,
+                                           ITensor           *output,
+                                           unsigned int       axis,
+                                           ReductionOperation op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

@@ -1761,14 +1845,23 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output
     INEKernel::configure(win);

     // Calculate output shape and set if empty
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
+    const TensorShape output_shape =
+        arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
     // Output auto initialization if not yet initialized
     const bool is_arg_min_max   = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
     DataType   output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+    auto_init_if_empty(*output->info(), input->info()
+                                            ->clone()
+                                            ->set_tensor_shape(output_shape)
+                                            .set_data_type(output_data_type)
+                                            .reset_padding()
+                                            .set_is_resizable(true));
 }

-Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status NEReductionOperationKernel::validate(const ITensorInfo *input,
+                                            const ITensorInfo *output,
+                                            unsigned int       axis,
+                                            ReductionOperation op)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
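For orientation, a minimal validate-then-configure sketch of this kernel's interface as it appears in the hunks above (illustrative only: the tensor setup is assumed, this is an internal kernel under src/, and applications would normally use the public NEReductionOperation function instead):

    #include "arm_compute/runtime/Tensor.h"
    #include "src/core/NEON/kernels/NEReductionOperationKernel.h"

    using namespace arm_compute;

    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));

    NEReductionOperationKernel kernel;
    const Status st =
        NEReductionOperationKernel::validate(input.info(), output.info(), /* axis */ 0, ReductionOperation::SUM);
    if (st.error_code() == ErrorCode::OK)
    {
        // Output is auto-initialised on configure; ARG_IDX_* ops would switch it to S32.
        kernel.configure(&input, &output, /* axis */ 0, ReductionOperation::SUM);
    }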
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.h b/src/core/NEON/kernels/NEReductionOperationKernel.h
index 08e654fd21..78bec62c14 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.h
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.h
@@ -77,7 +77,8 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);

     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEReorderKernel.cpp b/src/core/NEON/kernels/NEReorderKernel.cpp
index 1a7f58bb08..f92a4c87da 100644
--- a/src/core/NEON/kernels/NEReorderKernel.cpp
+++ b/src/core/NEON/kernels/NEReorderKernel.cpp
@@ -24,11 +24,13 @@
 #if defined(__aarch64__)

 #include "src/core/NEON/kernels/NEReorderKernel.h"
-#include "src/common/utils/Log.h"
-#include "src/core/NEON/kernels/arm_gemm/transform.hpp"
+
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"

+#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/arm_gemm/transform.hpp"
+
 namespace arm_compute
 {

@@ -37,29 +39,32 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info)
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    switch(_input->info()->data_type())
+    switch (_input->info()->data_type())
     {
         case DataType::F32:
         {
             const int ksize_rows_elements = _xmax * _ksize;
-            const int jump_rows = ksize_rows_elements * window.x().start();
-            const int k_start   = window.x().start() * _ksize;
-            const int k_end     = std::min(window.x().end() * _ksize, _kmax);
-            const int stride    = _kmax;
-            if(k_start < k_end)
+            const int jump_rows           = ksize_rows_elements * window.x().start();
+            const int k_start             = window.x().start() * _ksize;
+            const int k_end               = std::min(window.x().end() * _ksize, _kmax);
+            const int stride              = _kmax;
+            if (k_start < k_end)
             {
-
-                switch(_output_wf)
+                switch (_output_wf)
                 {
                     case WeightFormat::OHWIo4:
                     {
-                        arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(reinterpret_cast<float *>(_output->buffer()) + jump_rows, reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+                        arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(
+                            reinterpret_cast<float *>(_output->buffer()) + jump_rows,
+                            reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
                         break;
                     }
 #if defined(ARM_COMPUTE_ENABLE_SVE)
                     case WeightFormat::OHWIo8:
                     {
-                        arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>(reinterpret_cast<float *>(_output->buffer()) + jump_rows, reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+                        arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>(
+                            reinterpret_cast<float *>(_output->buffer()) + jump_rows,
+                            reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
                         break;
                     }
 #endif /* ARM_COMPUTE_ENABLE_SVE */
@@ -78,11 +83,20 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info)
 }

 NEReorderKernel::NEReorderKernel()
-    : _input(nullptr), _output(nullptr), _ksize(0), _kmax(0), _xmax(0), _input_wf(WeightFormat::ANY), _output_wf(WeightFormat::ANY)
+    : _input(nullptr),
+      _output(nullptr),
+      _ksize(0),
+      _kmax(0),
+      _xmax(0),
+      _input_wf(WeightFormat::ANY),
+      _output_wf(WeightFormat::ANY)
 {
 }

-void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf)
+void NEReorderKernel::configure(const ITensor            *input,
+                                ITensor                  *output,
+                                arm_compute::WeightFormat input_wf,
+                                arm_compute::WeightFormat output_wf)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, input_wf, output_wf);
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -96,7 +110,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu

     // Setting parameters for transform
     auto dims = input->info()->num_dimensions();
-    switch(dims)
+    switch (dims)
     {
         case 2:
         {
@@ -120,7 +134,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu
     // Window size is set by rows / _ksize
     Window win;
     int    window_size = 0;
-    switch(_output_wf)
+    switch (_output_wf)
     {
 #if defined(ARM_COMPUTE_ENABLE_SVE)
         case WeightFormat::OHWIo8:
@@ -142,7 +156,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu
             break;
         }
     }
-    if(_kmax % _ksize != 0)
+    if (_kmax % _ksize != 0)
     {
         window_size += 1;
     }
@@ -152,11 +166,14 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu
     INEKernel::configure(win);
 }

-Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf)
+Status NEReorderKernel::validate(const ITensorInfo        *input,
+                                 const ITensorInfo        *output,
+                                 arm_compute::WeightFormat input_wf,
+                                 arm_compute::WeightFormat output_wf)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-    if(output->tensor_shape().total_size() != 0)
+    if (output->tensor_shape().total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -167,20 +184,20 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
         int  output_x_dim;
         int  output_k_dim;
         auto dims = output->num_dimensions();
-        switch(dims)
+        switch (dims)
         {
             case 2:
             {
-                input_x_dim  = input->dimension(0); // Number of columns in input matrix
-                input_k_dim  = input->dimension(1); // Number of rows in input matrix
+                input_x_dim  = input->dimension(0);  // Number of columns in input matrix
+                input_k_dim  = input->dimension(1);  // Number of rows in input matrix
                 output_x_dim = output->dimension(0); // Number of columns in output matrix
                 output_k_dim = output->dimension(1); // Number of rows in output matrix
                 break;
             }
             case 4:
             {
-                input_x_dim  = input->dimension(2); // Number of columns in input matrix
-                input_k_dim  = input->dimension(3); // Number of rows in input matrix
+                input_x_dim  = input->dimension(2);  // Number of columns in input matrix
+                input_k_dim  = input->dimension(3);  // Number of rows in input matrix
                 output_x_dim = output->dimension(2); // Number of columns in output matrix
                 output_k_dim = output->dimension(3); // Number of rows in output matrix
                 break;
             }
@@ -192,7 +209,7 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
         }

         int ksize;
-        switch(output_wf)
+        switch (output_wf)
         {
             case WeightFormat::OHWIo8:
             {
@@ -216,11 +233,10 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
         ARM_COMPUTE_RETURN_ERROR_ON(rnd_up_input_kdim != output_k_dim);
         // output x_dim needs to be same as input
         ARM_COMPUTE_RETURN_ERROR_ON(input_x_dim != output_x_dim);
-
     }
     return Status{};
 }

 } // namespace arm_compute

-#endif  // defined(__aarch64__)
\ No newline at end of file
+#endif // defined(__aarch64__)
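The window sizing in NEReorderKernel::configure() above is a ceiling division: each kernel call reorders _ksize rows, and a trailing partial block still costs one call, which is what the 'if (_kmax % _ksize != 0) { window_size += 1; }' branch encodes. Stated standalone (hypothetical helper, not library code):

    #include <cstdint>

    // Number of blocks needed to cover kmax rows in blocks of ksize, i.e. ceil(kmax / ksize).
    int32_t num_blocks(int32_t kmax, int32_t ksize)
    {
        return kmax / ksize + (kmax % ksize != 0 ? 1 : 0);
    }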
diff --git a/src/core/NEON/kernels/NEReorderKernel.h b/src/core/NEON/kernels/NEReorderKernel.h
index 07908890f4..4528b25245 100644
--- a/src/core/NEON/kernels/NEReorderKernel.h
+++ b/src/core/NEON/kernels/NEReorderKernel.h
@@ -26,9 +26,10 @@
 #ifndef ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL
 #define ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL

-#include "src/core/NEON/INEKernel.h"
 #include "arm_compute/core/Types.h"

+#include "src/core/NEON/INEKernel.h"
+
 namespace arm_compute
 {

@@ -36,7 +37,6 @@ namespace arm_compute
 class NEReorderKernel : public INEKernel
 {
 public:
-
     const char *name() const override
     {
         return "NEReorderKernel";
@@ -62,7 +62,10 @@ public:
      * @param[in] input_wf  WeightFormat of input.
      * @param[in] output_wf WeightFormat of output.
      */
-    void configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf);
+    void configure(const ITensor            *input,
+                   ITensor                  *output,
+                   arm_compute::WeightFormat input_wf,
+                   arm_compute::WeightFormat output_wf);

     /** Static function to check if given info will lead to a valid configuration of @ref NEReorderKernel
      *
@@ -73,25 +76,27 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf);
+    static Status validate(const ITensorInfo        *input,
+                           const ITensorInfo        *output,
+                           arm_compute::WeightFormat input_wf,
+                           arm_compute::WeightFormat output_wf);

     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
-
-/*****************************************************************************/
+    /*****************************************************************************/
 private:
-    const ITensor *_input{nullptr};  // Input tensor
-    ITensor       *_output{nullptr}; // Output tensor
-    int32_t        _ksize{0};        // Blocking parameter, how many rows kernel reorders on each call
-    int32_t        _kmax{0};         // Rows in input tensor
-    int32_t        _xmax{0};         // Columns in input tensor
-    WeightFormat   _input_wf{WeightFormat::UNSPECIFIED};  // WeightFormat of input tensor
-    WeightFormat   _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor
+    const ITensor *_input{nullptr};                       // Input tensor
+    ITensor       *_output{nullptr};                      // Output tensor
+    int32_t        _ksize{0}; // Blocking parameter, how many rows kernel reorders on each call
+    int32_t        _kmax{0};  // Rows in input tensor
+    int32_t        _xmax{0};  // Columns in input tensor
+    WeightFormat   _input_wf{WeightFormat::UNSPECIFIED};  // WeightFormat of input tensor
+    WeightFormat   _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor
 };

 } // namespace arm_compute
 #endif /* ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL */

-#endif  // defined(__aarch64__)
\ No newline at end of file
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
index a7b830c066..227570405c 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -28,8 +28,9 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"

@@ -50,13 +51,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
     const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);

     ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0,
+                                    "The width of the input tensor must be a multiple of stride");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0,
+                                    "The height of the input tensor must be a multiple of stride");

     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+        const TensorInfo tensor_info_output =
+            output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -65,8 +69,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
 }
 } // namespace

-NEReorgLayerKernel::NEReorgLayerKernel()
-    : _input(nullptr), _output(nullptr), _stride(1)
+NEReorgLayerKernel::NEReorgLayerKernel() : _input(nullptr), _output(nullptr), _stride(1)
 {
 }

@@ -121,23 +124,26 @@ void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info)
     Iterator out(_output, collapsed_window);

     // Perform reorg
-    execute_window_loop(collapsed_window, [&](const Coordinates & id)
-    {
-        // Get spatial coords and channels
-        const unsigned int w = id[idx_w];
-        const unsigned int h = id[idx_h];
-        const unsigned int c = id[idx_c];
-
-        // Calculate mapping
-        const unsigned int offset     = c / out_c;
-        Coordinates        map_coords = id;
-        map_coords.set(idx_w, w * stride + offset % stride);
-        map_coords.set(idx_h, h * stride + offset / stride);
-        map_coords.set(idx_c, c % out_c);
-
-        // Perform mapping
-        std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), _input->info()->element_size());
-    },
-    out);
+    execute_window_loop(
+        collapsed_window,
+        [&](const Coordinates &id)
+        {
+            // Get spatial coords and channels
+            const unsigned int w = id[idx_w];
+            const unsigned int h = id[idx_h];
+            const unsigned int c = id[idx_c];
+
+            // Calculate mapping
+            const unsigned int offset     = c / out_c;
+            Coordinates        map_coords = id;
+            map_coords.set(idx_w, w * stride + offset % stride);
+            map_coords.set(idx_h, h * stride + offset / stride);
+            map_coords.set(idx_c, c % out_c);
+
+            // Perform mapping
+            std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords),
+                        _input->info()->element_size());
+        },
+        out);
 }
 } // namespace arm_compute
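A scalar restatement of the reorg coordinate mapping computed in the lambda above (hypothetical struct, same arithmetic): for stride s, output channel c reads input channel c % out_c at a spatial position shifted by the row-major decomposition of c / out_c.

    #include <cstddef>

    struct Coord
    {
        std::size_t w, h, c;
    };

    // Source coordinate for output element (w, h, c), mirroring NEReorgLayerKernel::run().
    Coord reorg_source(std::size_t w, std::size_t h, std::size_t c, std::size_t stride, std::size_t out_c)
    {
        const std::size_t offset = c / out_c;
        return Coord{w * stride + offset % stride, h * stride + offset / stride, c % out_c};
    }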
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
index ca6c117882..d2437eecd0 100644
--- a/src/core/NEON/kernels/NEReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -26,15 +26,17 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"

 namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
 {
     ARM_COMPUTE_UNUSED(use_inverted_axis);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis);
@@ -42,11 +44,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Current implementation only supports up to 4 dimensions.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4,
+                                    "Current implementation only supports up to 4 dimensions.");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");

     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -57,8 +60,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 }
 } // namespace

-NEReverseKernel::NEReverseKernel()
-    : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false)
+NEReverseKernel::NEReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false)
 {
 }

@@ -80,7 +82,10 @@ void NEReverseKernel::configure(const ITensor *input, ITensor *output, const ITe
     INEKernel::configure(calculate_max_window(*output->info()));
 }

-Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
+Status NEReverseKernel::validate(const ITensorInfo *input,
+                                 const ITensorInfo *output,
+                                 const ITensorInfo *axis,
+                                 bool               use_inverted_axis)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, use_inverted_axis));

@@ -88,29 +93,30 @@ Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
 }

 template <typename T>
-void run_reverse(const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis)
+void run_reverse(
+    const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis)
 {
     unsigned int axis_bit = 0;
     const int    rank     = input->info()->num_dimensions();

-    for(unsigned int i = 0; i < axis->info()->dimension(0); ++i)
+    for (unsigned int i = 0; i < axis->info()->dimension(0); ++i)
     {
         int axis_i = *(reinterpret_cast<const int *>(axis->buffer()) + i);

         // The values of axis tensor must be between [-rank, rank-1].
-        if((axis_i < -rank) || (axis_i >= rank))
+        if ((axis_i < -rank) || (axis_i >= rank))
         {
             ARM_COMPUTE_ERROR("the valuses of the axis tensor must be within [-rank, rank-1].");
         }

         // In case of negative axis value i.e targeted axis(i) = rank + axis(i)
-        if(axis_i < 0)
+        if (axis_i < 0)
         {
             axis_i = rank + axis_i;
         }

         // Reverse ACL axis indices convention i.e. (inverted)axis = (tensor_rank - 1) - axis
-        if(use_inverted_axis)
+        if (use_inverted_axis)
         {
             axis_i = (rank - 1) - axis_i;
         }
@@ -127,43 +133,47 @@ void run_reverse(const Window &window, const ITensor *input, const ITensor *axis
     win.set(Window::DimX, Window::Dimension(0, 1, 1));

     Iterator input_it(input, win);
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            auto in = wrapper::vloadq(reinterpret_cast<T *>(input_it.ptr()) + x);
-
-            // Reverse 0 axis
-            if(axis_bit & 0x1)
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                in = wrapper::vrev64(in);
-                in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+                auto in = wrapper::vloadq(reinterpret_cast<T *>(input_it.ptr()) + x);
+
+                // Reverse 0 axis
+                if (axis_bit & 0x1)
+                {
+                    in = wrapper::vrev64(in);
+                    in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+                }
+
+                const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x;
+                const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+                const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+                const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
+
+                auto out_ptr =
+                    reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w)));
+                wrapper::vstore(out_ptr, in);
             }

-            const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x;
-            const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
-            const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
-            const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
-
-            auto out_ptr = reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w)));
-            wrapper::vstore(out_ptr, in);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            const auto in = *(reinterpret_cast<T *>(input_it.ptr()) + x);
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                const auto in = *(reinterpret_cast<T *>(input_it.ptr()) + x);

-            const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x;
-            const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
-            const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
-            const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
-
-            *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) = in;
-        }
-    },
-    input_it);
+                const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x;
+                const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+                const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+                const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];

+                *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) =
+                    in;
+            }
+        },
+        input_it);
 }

 void NEReverseKernel::run(const Window &window, const ThreadInfo &info)
@@ -172,7 +182,7 @@ void NEReverseKernel::run(const Window &window, const ThreadInfo &info)
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

-    switch(_input->info()->element_size())
+    switch (_input->info()->element_size())
     {
         case 4:
             run_reverse<uint32_t>(window, _input, _axis, _output, _use_inverted_axis);
diff --git a/src/core/NEON/kernels/NEReverseKernel.h b/src/core/NEON/kernels/NEReverseKernel.h
index 7d9ec4691c..92261887f4 100644
--- a/src/core/NEON/kernels/NEReverseKernel.h
+++ b/src/core/NEON/kernels/NEReverseKernel.h
@@ -68,7 +68,8 @@ public:
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);

     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
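run_reverse() above folds the axis tensor into a bit mask after two normalisations: negative values wrap by the rank, and the optional "inverted" numbering maps an axis to (rank - 1) - axis. The same logic in isolation (hypothetical helper; the range check on each axis value is elided here but enforced in the kernel):

    #include <cstdint>

    uint32_t make_axis_mask(const int *axes, int n, int rank, bool use_inverted_axis)
    {
        uint32_t mask = 0;
        for (int i = 0; i < n; ++i)
        {
            int a = axes[i];
            if (a < 0)
            {
                a = rank + a; // wrap negative axis values
            }
            if (use_inverted_axis)
            {
                a = (rank - 1) - a; // reverse ACL axis convention
            }
            mask |= 1u << a;
        }
        return mask;
    }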
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index b8c9b244ee..7789b828ea 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -29,13 +29,12 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
 #include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
-
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/cpu/kernels/select/list.h"

 #include <arm_neon.h>
@@ -54,7 +53,8 @@ struct SelectKernelSelectorData
 };

 using SelectorPtr = std::add_pointer<bool(const SelectKernelSelectorData &data)>::type;
-using KernelPtr   = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
+using KernelPtr =
+    std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &)>::type;

 struct SelectKernelSelector
 {
@@ -63,95 +63,62 @@ struct SelectKernelSelector
     KernelPtr   ukernel;
 };

-static const SelectKernelSelector available_kernels[] =
-{
-    {
-        "neon_s8_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == true; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)
-    },
-    {
-        "neon_s16_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == true; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)
-    },
-    {
-        "neon_s32_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == true; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)
-    },
-    {
-        "neon_u8_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == true; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)
-    },
-    {
-        "neon_u16_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == true; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)
-    },
-    {
-        "neon_u32_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == true; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)
-    },
-    {
-        "neon_s8_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == false; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)
-    },
-    {
-        "neon_s16_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == false; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)
-    },
-    {
-        "neon_s32_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == false; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)
-    },
-    {
-        "neon_u8_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == false; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)
-    },
-    {
-        "neon_u16_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == false; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)
-    },
-    {
-        "neon_u32_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == false; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)
-    },
-    {
-        "neon_f16_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == true; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)
-    },
-    {
-        "neon_f16_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == false; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)
-    },
-    {
-        "neon_f32_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == true; },
-        REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)
-    },
-    {
-        "neon_f32_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == false; },
-        REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)
-    },
+static const SelectKernelSelector available_kernels[] = {
+    {"neon_s8_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == true; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)},
+    {"neon_s16_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == true; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)},
+    {"neon_s32_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == true; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)},
+    {"neon_u8_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == true; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)},
+    {"neon_u16_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == true; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)},
+    {"neon_u32_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == true; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)},
+    {"neon_s8_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == false; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)},
+    {"neon_s16_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == false; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)},
+    {"neon_s32_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == false; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)},
+    {"neon_u8_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == false; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)},
+    {"neon_u16_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == false; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)},
+    {"neon_u32_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == false; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)},
+    {"neon_f16_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == true; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)},
+    {"neon_f16_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == false; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)},
+    {"neon_f32_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == true; },
+     REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)},
+    {"neon_f32_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == false; },
+     REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)},
 };

 const SelectKernelSelector *get_implementation(const SelectKernelSelectorData &data)
 {
-    for(const auto &uk : available_kernels)
+    for (const auto &uk : available_kernels)
     {
-        if(uk.is_selected(data))
+        if (uk.is_selected(data))
         {
             return &uk;
         }
@@ -184,7 +151,8 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor
     INEKernel::configure(win);
 }

-Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+Status
+NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(x);
@@ -195,9 +163,11 @@ Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, cons

     const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
-    ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+    ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank &&
+                                ((c->tensor_shape().num_dimensions() > 1) ||
+                                 (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));

-    if(output != nullptr && output->total_size() != 0)
+    if (output != nullptr && output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
@@ -214,7 +184,7 @@ void NESelectKernel::run(const Window &window, const ThreadInfo &info)
     ARM_COMPUTE_ERROR_ON(_output == nullptr);
     ARM_COMPUTE_ERROR_ON(_output->info() == nullptr);

-    const auto *uk = get_implementation(SelectKernelSelectorData{ _output->info()->data_type(), _has_same_rank });
+    const auto *uk = get_implementation(SelectKernelSelectorData{_output->info()->data_type(), _has_same_rank});
     ARM_COMPUTE_ERROR_ON(uk == nullptr);
     ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
     uk->ukernel(_c, _x, _y, _output, window);
diff --git a/src/core/NEON/kernels/NESelectKernel.h b/src/core/NEON/kernels/NESelectKernel.h
index e82105a68e..4fec42b536 100644
--- a/src/core/NEON/kernels/NESelectKernel.h
+++ b/src/core/NEON/kernels/NESelectKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NESELECTKERNEL_H

 #include "arm_compute/core/Types.h"
+
 #include "src/core/NEON/INEKernel.h"

 namespace arm_compute
@@ -82,7 +83,6 @@ public:
     void run(const Window &window, const ThreadInfo &info) override;

 private:
-
     const ITensor *_c; /**< Condition tensor */
     const ITensor *_x; /**< Source tensor 1 */
     const ITensor *_y; /**< Source tensor 2 */
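The available_kernels table and get_implementation() above follow a first-match lookup idiom that recurs across these CPU kernels; the same pattern in miniature (hypothetical types, mirroring the real selector):

    #include <cstddef>

    struct Selector
    {
        bool (*is_selected)(int data_type, bool same_rank);
        void (*ukernel)();
    };

    // Return the first entry whose predicate accepts the runtime configuration;
    // NESelectKernel::run() asserts that the result and its ukernel are non-null.
    const Selector *pick(const Selector *table, std::size_t n, int data_type, bool same_rank)
    {
        for (std::size_t i = 0; i < n; ++i)
        {
            if (table[i].is_selected(data_type, same_rank))
            {
                return &table[i];
            }
        }
        return nullptr;
    }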
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
index 673eace3c1..da023aeb96 100644
--- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
@@ -26,11 +26,12 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"

 #include <arm_neon.h>
 #include <cstdint>
@@ -41,19 +42,22 @@ namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+                          const ITensorInfo *block_info,
+                          const ITensorInfo *paddings,
+                          const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
     ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 });
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2});
     ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 });
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2});

     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const DataLayout data_layout = input->data_layout();
         const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
@@ -64,7 +68,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
     return Status{};
 }

-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status validate_arguments_static(const ITensorInfo *input,
+                                 const int          block_shape_x,
+                                 const int          block_shape_y,
+                                 const Size2D      &padding_left,
+                                 const Size2D      &padding_right,
                                  const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -73,9 +81,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
     ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);

     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right);
+        TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+            input, block_shape_x, block_shape_y, padding_left, padding_right);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -86,14 +95,25 @@
 }
 } // namespace
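The shape relation these validators check, stated directly: after padding, each spatial dimension shrinks by its block size and the batch grows by the product of the block sizes. A sketch under that standard space-to-batch definition (hypothetical free function; the library computes this via misc::shape_calculator::compute_space_to_batch_shape):

    #include <cstddef>

    // pad_w / pad_h are the total (left + right, top + bottom) paddings.
    void space_to_batch_shape(std::size_t w, std::size_t h, std::size_t batch,
                              std::size_t block_x, std::size_t block_y,
                              std::size_t pad_w, std::size_t pad_h,
                              std::size_t &out_w, std::size_t &out_h, std::size_t &out_batch)
    {
        out_w     = (w + pad_w) / block_x;
        out_h     = (h + pad_h) / block_y;
        out_batch = batch * block_x * block_y;
    }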
TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape( + input->info(), block_shape_x, block_shape_y, padding_left, padding_right); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, + padding_right, output->info())); _input = input; _output = output; @@ -128,15 +155,23 @@ void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_ INEKernel::configure(win); } -Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output)); return Status{}; } -Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -146,17 +181,17 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - if(_block_shape != nullptr) + if (_block_shape != nullptr) { // Retrieve the block shapes dynamically _block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0))); _block_shape_y = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(1))); } - if(_paddings != nullptr) + if (_paddings != nullptr) { - const size_t pad_left_x = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 0, 0 })); - const size_t pad_left_y = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 1, 0 })); + const size_t pad_left_x = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({0, 0})); + const size_t pad_left_y = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({1, 0})); _padding_left = Size2D(pad_left_x, pad_left_y); } const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); @@ -173,57 +208,61 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info int batch_id = 0; // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & 
id) - { - const size_t out_x = id.x(); - const size_t out_y = id.y(); - const size_t z = id.z(); - const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; - const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; - if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + execute_window_loop( + slice_out, + [&](const Coordinates &id) { - const int w = batch_id % batch_size; - const int in_x = pos_x - _padding_left.x(); - const int in_y = pos_y - _padding_left.y(); - Coordinates input_coords{ in_x, in_y, z, w }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - } - }, - out); + const size_t out_x = id.x(); + const size_t out_y = id.y(); + const size_t z = id.z(); + const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; + const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; + if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && + pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + { + const int w = batch_id % batch_size; + const int in_x = pos_x - _padding_left.x(); + const int in_y = pos_y - _padding_left.y(); + Coordinates input_coords{in_x, in_y, z, w}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + } + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } else { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - const size_t out_x = id.y(); - const size_t out_y = id.z(); - const size_t z = id.x(); - const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; - const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; - if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + execute_window_loop( + slice_out, + [&](const Coordinates &id) { - const int w = batch_id % batch_size; - const int in_x = pos_x - _padding_left.x(); - const int in_y = pos_y - _padding_left.y(); - Coordinates input_coords{ z, in_x, in_y, w }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - } - }, - out); + const size_t out_x = id.y(); + const size_t out_y = id.z(); + const size_t z = id.x(); + const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; + const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; + if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && + pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + { + const int w = batch_id % batch_size; + const int in_x = pos_x - _padding_left.x(); + const int in_y = pos_y - _padding_left.y(); + Coordinates input_coords{z, in_x, in_y, w}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + } + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h index 44b8cbb514..6292c07136 100644 --- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h +++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h @@ -25,6 +25,7 @@ #define 
ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -69,7 +70,12 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output); + void configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -79,7 +85,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel (Static block shape and paddings) * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -91,7 +100,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp index 7687c50c40..b49c5ee344 100644 --- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp +++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include <arm_neon.h> #include <cstdint> @@ -50,7 +51,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const DataLayout data_layout = input->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -115,43 +116,45 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info int batch_id = 0; // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - const size_t channel_id = id.z(); - const size_t in_x = id.x() * _block_shape + 
(channel_id / channel_size) % _block_shape; - const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape; - const int z = channel_id % channel_size; - Coordinates input_coords{ in_x, in_y, z, batch_id }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); + execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const size_t channel_id = id.z(); + const size_t in_x = id.x() * _block_shape + (channel_id / channel_size) % _block_shape; + const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape; + const int z = channel_id % channel_size; + Coordinates input_coords{in_x, in_y, z, batch_id}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } else { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - const size_t channel_id = id.x(); - const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape; - const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape; - const int z = channel_id % channel_size; - Coordinates input_coords{ z, in_x, in_y, batch_id }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); + execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const size_t channel_id = id.x(); + const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape; + const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape; + const int z = channel_id % channel_size; + Coordinates input_coords{z, in_x, in_y, batch_id}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h index 953b68a401..7d147c5b94 100644 --- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h +++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp index 93080e2ac7..e23b40a9aa 100644 --- a/src/core/NEON/kernels/NEStackLayerKernel.cpp +++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp @@ -25,13 +25,13 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -41,7 +41,11 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, + unsigned int axis, + unsigned 
int idx_input, + unsigned int num_tensors, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. @@ -50,9 +54,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), + compute_stack_shape(*input, axis, num_tensors)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } @@ -60,7 +65,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output) { // Output auto initialization if not yet initialized auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors))); @@ -71,11 +77,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsi return std::make_pair(Status{}, win); } -inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input) +inline Coordinates +shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input) { constexpr int max_out_coord = 5; // Input shape is at most 4D, output is at most 5D Coordinates id_out = id; - for(unsigned int i = max_out_coord - 1; i > axis; --i) + for (unsigned int i = max_out_coord - 1; i > axis; --i) { id_out.set(i, id[i - 1]); } @@ -84,12 +91,12 @@ inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id, } } // namespace -NEStackLayerKernel::NEStackLayerKernel() - : _input(nullptr), _output(nullptr), _axis(), _idx_input() +NEStackLayerKernel::NEStackLayerKernel() : _input(nullptr), _output(nullptr), _axis(), _idx_input() { } -void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output) +void NEStackLayerKernel::configure( + const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info())); @@ -106,10 +113,15 @@ void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsi INEKernel::configure(win_config.second); } -Status NEStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status NEStackLayerKernel::validate(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output)); - 
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first); return Status{}; } @@ -131,12 +143,15 @@ void NEStackLayerKernel::run(const Window &window, const ThreadInfo &info) const int stride_w = _output->info()->num_dimensions() >= 3 ? _output->info()->strides_in_bytes()[3] : 0; const int stride_k = _output->info()->num_dimensions() >= 4 ? _output->info()->strides_in_bytes()[4] : 0; - execute_window_loop(window, [&](const Coordinates & id) - { - Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input); - const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k; - std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size()); - }, - input); + execute_window_loop( + window, + [&](const Coordinates &id) + { + Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input); + const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + + id_out[4] * stride_k; + std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size()); + }, + input); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEStackLayerKernel.h b/src/core/NEON/kernels/NEStackLayerKernel.h index 9b36518e4d..685812b56d 100644 --- a/src/core/NEON/kernels/NEStackLayerKernel.h +++ b/src/core/NEON/kernels/NEStackLayerKernel.h @@ -26,6 +26,7 @@ #define ARM_COMPUTE_NESTACKLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -64,7 +65,8 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input. 
* */ - void configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output); + void configure( + const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEStackLayerKernel * * @note Supported input tensor rank: up to 4 @@ -78,7 +80,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output); // Inherited methods overridden void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp index 2b406a8b8b..efff51be9d 100644 --- a/src/core/NEON/kernels/NEStridedSliceKernel.cpp +++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp @@ -26,9 +26,10 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,9 +39,14 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); @@ -49,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) - { - return i == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; })); // Get expected output shape - const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0); // Checks output if configured - if(output->total_size() != 0) + if (output->total_size() != 0) { const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info); @@ 
-71,14 +74,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, return Status{}; } -std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { // Output tensor auto initialization if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape)); // Create window @@ -88,38 +95,49 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input } } // namespace -NEStridedSliceKernel::NEStridedSliceKernel() - : _starts_abs(), _final_strides(), _shrink_mask() +NEStridedSliceKernel::NEStridedSliceKernel() : _starts_abs(), _final_strides(), _shrink_mask() { } -void NEStridedSliceKernel::configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void NEStridedSliceKernel::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); _shrink_mask = shrink_axis_mask; const TensorShape &input_shape = input->tensor_shape(); Coordinates ends_abs; - std::tie(_starts_abs, ends_abs, _final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords( - input_shape, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + std::tie(_starts_abs, ends_abs, _final_strides) = + arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides, + begin_mask, end_mask, shrink_axis_mask); // Configure kernel window - auto win_config = validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + auto win_config = + validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); } -Status NEStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSliceKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t 
begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), - starts, ends, strides, begin_mask, end_mask, shrink_axis_mask) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), starts, ends, + strides, begin_mask, end_mask, shrink_axis_mask) + .first); return Status{}; } @@ -156,7 +174,7 @@ void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, co size_t length_x = win.shape()[0]; - if(_final_strides[0] == 1 && !is_shrink_x) + if (_final_strides[0] == 1 && !is_shrink_x) { win.set(Window::DimX, Window::Dimension(0, 1, 1)); width_size = width_size * length_x; @@ -183,16 +201,17 @@ void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, co uint8_t *cur_ptr; execute_window_loop( - win, [&](const Coordinates & id) - { - cur_ptr = input_base; - cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0; - cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1; - cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2; - cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3; - - std::copy_n(cur_ptr, width_size, output_it.ptr()); - }, - output_it); + win, + [&](const Coordinates &id) + { + cur_ptr = input_base; + cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0; + cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1; + cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2; + cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3; + + std::copy_n(cur_ptr, width_size, output_it.ptr()); + }, + output_it); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.h b/src/core/NEON/kernels/NEStridedSliceKernel.h index 9ce517417d..a475f09a17 100644 --- a/src/core/NEON/kernels/NEStridedSliceKernel.h +++ b/src/core/NEON/kernels/NEStridedSliceKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" #include <cstdint> @@ -68,9 +69,14 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - void configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + void configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSliceKernel * @@ -86,9 +92,14 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. 
*/ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NETileKernel.cpp b/src/core/NEON/kernels/NETileKernel.cpp index 94256dc12d..577ce5b69e 100644 --- a/src/core/NEON/kernels/NETileKernel.cpp +++ b/src/core/NEON/kernels/NETileKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,15 +44,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4); ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) - { - return e == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; })); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -59,8 +58,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -NETileKernel::NETileKernel() - : _input(nullptr), _output(nullptr) +NETileKernel::NETileKernel() : _input(nullptr), _output(nullptr) { } @@ -95,8 +93,9 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - Window output_window{ window }; - output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(), _input->info()->dimension(0))); + Window output_window{window}; + output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(), + _input->info()->dimension(0))); Window out_slice = output_window.first_slice_window_1D(); const auto src_shape = _input->info()->tensor_shape(); @@ -104,17 +103,19 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info) { Iterator output_it(_output, out_slice); - execute_window_loop(out_slice, [&](const Coordinates & id) - { - const size_t x = id.x(); - const size_t y = id.y(); - const size_t z = id.z(); - const size_t w = id[3]; - Coordinates input_coords{ x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3] }; - 
memcpy(output_it.ptr(), _input->ptr_to_element(input_coords), _input->info()->dimension(0) * _input->info()->element_size()); - }, - output_it); - } - while(output_window.slide_window_slice_1D(out_slice)); + execute_window_loop( + out_slice, + [&](const Coordinates &id) + { + const size_t x = id.x(); + const size_t y = id.y(); + const size_t z = id.z(); + const size_t w = id[3]; + Coordinates input_coords{x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3]}; + memcpy(output_it.ptr(), _input->ptr_to_element(input_coords), + _input->info()->dimension(0) * _input->info()->element_size()); + }, + output_it); + } while (output_window.slide_window_slice_1D(out_slice)); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp index dbd47ccfa9..13c2d314e4 100644 --- a/src/core/NEON/kernels/assembly/depthwise.hpp +++ b/src/core/NEON/kernels/assembly/depthwise.hpp @@ -38,9 +38,8 @@ struct DepthwiseConfig DepthwiseMethod method = DepthwiseMethod::DEFAULT; std::string filter = ""; - DepthwiseConfig(DepthwiseMethod method) - : method(method) {}; - DepthwiseConfig() {}; + DepthwiseConfig(DepthwiseMethod method) : method(method){}; + DepthwiseConfig(){}; }; struct DepthwiseArgs @@ -63,18 +62,24 @@ struct DepthwiseArgs bool fast_mode = false; - DepthwiseArgs( - const CPUInfo *cpu_info, - unsigned int kernel_rows, unsigned int kernel_cols, - unsigned int stride_rows, unsigned int stride_cols, - unsigned int dilation_rows, unsigned int dilation_cols, - unsigned int n_batches, unsigned int input_rows, unsigned int input_cols, - unsigned int input_channels, - unsigned int output_rows, unsigned int output_cols, - unsigned int channel_multiplier, - PaddingValues padding, arm_gemm::Activation activation, - - const DepthwiseConfig *config) + DepthwiseArgs(const CPUInfo *cpu_info, + unsigned int kernel_rows, + unsigned int kernel_cols, + unsigned int stride_rows, + unsigned int stride_cols, + unsigned int dilation_rows, + unsigned int dilation_cols, + unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int input_channels, + unsigned int output_rows, + unsigned int output_cols, + unsigned int channel_multiplier, + PaddingValues padding, + arm_gemm::Activation activation, + + const DepthwiseConfig *config) : cpu_info(cpu_info), kernel_rows(kernel_rows), kernel_cols(kernel_cols), @@ -95,20 +100,38 @@ struct DepthwiseArgs { } - DepthwiseArgs( - const CPUInfo *cpu_info, - unsigned int kernel_rows, unsigned int kernel_cols, - unsigned int stride_rows, unsigned int stride_cols, - unsigned int n_batches, unsigned int input_rows, unsigned int input_cols, - unsigned int input_channels, - unsigned int output_rows, unsigned int output_cols, - unsigned int channel_multiplier, - PaddingValues padding, arm_gemm::Activation activation, - const DepthwiseConfig *config) - : DepthwiseArgs(cpu_info, kernel_rows, kernel_cols, stride_rows, - stride_cols, 1, 1, n_batches, input_rows, input_cols, - input_channels, output_rows, output_cols, - channel_multiplier, padding, activation, config) + DepthwiseArgs(const CPUInfo *cpu_info, + unsigned int kernel_rows, + unsigned int kernel_cols, + unsigned int stride_rows, + unsigned int stride_cols, + unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int input_channels, + unsigned int output_rows, + unsigned int output_cols, + unsigned int channel_multiplier, + PaddingValues padding, + arm_gemm::Activation activation, + const 
DepthwiseConfig *config) + : DepthwiseArgs(cpu_info, + kernel_rows, + kernel_cols, + stride_rows, + stride_cols, + 1, + 1, + n_batches, + input_rows, + input_cols, + input_channels, + output_rows, + output_cols, + channel_multiplier, + padding, + activation, + config) { } }; @@ -127,17 +150,18 @@ struct Tile { } - Tile() - : Tile(nullptr, 0, 0, 0) + Tile() : Tile(nullptr, 0, 0, 0) { } - void load_from( - const TInput *input, - const unsigned int ld_row, const unsigned int ld_col, - const unsigned int n_rows, const unsigned int n_cols, - const int input_i, const int input_j, - const unsigned int channel_multiplier) const + void load_from(const TInput *input, + const unsigned int ld_row, + const unsigned int ld_col, + const unsigned int n_rows, + const unsigned int n_cols, + const int input_i, + const int input_j, + const unsigned int channel_multiplier) const { const auto pad_top = input_i < 0 ? -input_i : 0; const auto pad_left = input_j < 0 ? -input_j : 0; @@ -145,18 +169,15 @@ struct Tile const auto padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top; const auto padded_cols = std::min(n_cols - input_j, tile_cols) - pad_left; - if(padded_rows < tile_rows || padded_cols < tile_cols) + if (padded_rows < tile_rows || padded_cols < tile_cols) { memset(array, 0, tile_rows * tile_cols * tile_channels * sizeof(TInput)); } - do_premultiply<TInput>( - (TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, - ld_row, ld_col, - array + pad_top * tile_cols * tile_channels + pad_left * tile_channels, - tile_cols * tile_channels, tile_channels, - padded_rows, padded_cols, tile_channels / channel_multiplier, - channel_multiplier); + do_premultiply<TInput>((TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, ld_row, + ld_col, array + pad_top * tile_cols * tile_channels + pad_left * tile_channels, + tile_cols * tile_channels, tile_channels, padded_rows, padded_cols, + tile_channels / channel_multiplier, channel_multiplier); } }; @@ -168,9 +189,8 @@ protected: std::string m_name{}; public: - DepthwiseCommon(const DepthwiseArgs &args) - : m_args(args) {}; - DepthwiseCommon(DepthwiseCommon &) = delete; + DepthwiseCommon(const DepthwiseArgs &args) : m_args(args){}; + DepthwiseCommon(DepthwiseCommon &) = delete; DepthwiseCommon &operator=(DepthwiseCommon &) = delete; std::string name() const override @@ -181,19 +201,18 @@ public: void set_name(std::string name) { // Only allow the name to be set once - if(m_name.empty()) + if (m_name.empty()) { m_name = name; } } - void execute( - const void *const input, - const void *const parameters, - void *const output, - void *const working_space, - const unsigned int thread_id, - const unsigned int n_threads) const override final + void execute(const void *const input, + const void *const parameters, + void *const output, + void *const working_space, + const unsigned int thread_id, + const unsigned int n_threads) const override final { const size_t ld_input_col = m_args.input_channels; const size_t ld_input_row = ld_input_col * m_args.input_cols; @@ -202,56 +221,47 @@ public: const size_t ld_output_row = ld_output_col * m_args.output_cols; const size_t ld_output_batch = ld_output_row * m_args.output_rows; - execute( - input, ld_input_col, ld_input_row, ld_input_batch, - parameters, output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, n_threads); + execute(input, ld_input_col, ld_input_row, ld_input_batch, parameters, output, ld_output_col, ld_output_row, + ld_output_batch, 
working_space, thread_id, n_threads); } - void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *const parameters, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *const working_space, - const unsigned int thread_id, - const unsigned int n_threads) const override final + void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *const parameters, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *const working_space, + const unsigned int thread_id, + const unsigned int n_threads) const override final { - execute( - m_args.n_batches, m_args.input_rows, m_args.input_cols, - m_args.input_channels, m_args.padding, - input, ld_input_col, ld_input_row, ld_input_batch, - parameters, - m_args.output_rows, m_args.output_cols, - output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, n_threads); + execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.input_channels, m_args.padding, input, + ld_input_col, ld_input_row, ld_input_batch, parameters, m_args.output_rows, m_args.output_cols, output, + ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, n_threads); } - void execute( - unsigned int batches, - unsigned int input_height, - unsigned int input_width, - unsigned int channels, - const PaddingValues &padding, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - unsigned int output_height, - unsigned int output_width, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const override final + void execute(unsigned int batches, + unsigned int input_height, + unsigned int input_width, + unsigned int channels, + const PaddingValues &padding, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + unsigned int output_height, + unsigned int output_width, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const override final { // Construct a new set of arguments to reflect that we might have been // passed different input/output tensors. 
Dilation is handled at this @@ -271,38 +281,33 @@ public: auto ld_output_col_d = ld_output_col * m_args.dilation_cols; auto ld_output_row_d = ld_output_row * m_args.dilation_rows; - for(size_t drow = 0; drow < m_args.dilation_rows; drow++) + for (size_t drow = 0; drow < m_args.dilation_rows; drow++) { size_t start_i; - std::tie(args.output_rows, args.input_rows, start_i, - args.padding.top, args.padding.bottom) = - get_reduced_view_for_dilation( - output_height, input_height, drow, m_args.dilation_rows, - m_args.kernel_rows, m_args.stride_rows, padding.top); + std::tie(args.output_rows, args.input_rows, start_i, args.padding.top, args.padding.bottom) = + get_reduced_view_for_dilation(output_height, input_height, drow, m_args.dilation_rows, + m_args.kernel_rows, m_args.stride_rows, padding.top); auto input_row = static_cast<const TInput *>(input) + start_i * ld_input_row; auto output_row = static_cast<TOutput *>(output) + drow * ld_output_row; - if(args.output_rows) + if (args.output_rows) { - for(size_t dcol = 0; dcol < m_args.dilation_cols; dcol++) + for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++) { size_t start_j; - std::tie(args.output_cols, args.input_cols, start_j, - args.padding.left, args.padding.right) = - get_reduced_view_for_dilation( - output_width, input_width, dcol, m_args.dilation_cols, - m_args.kernel_cols, m_args.stride_cols, padding.left); + std::tie(args.output_cols, args.input_cols, start_j, args.padding.left, args.padding.right) = + get_reduced_view_for_dilation(output_width, input_width, dcol, m_args.dilation_cols, + m_args.kernel_cols, m_args.stride_cols, padding.left); const TInput *input_col = input_row + start_j * ld_input_col; TOutput *output_col = output_row + dcol * ld_output_col; - if(args.output_cols) + if (args.output_cols) { - this->execute_internal( - args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, parameters, - output_col, ld_output_col_d, ld_output_row_d, ld_output_batch, - working_space, thread_id, n_threads); + this->execute_internal(args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, + parameters, output_col, ld_output_col_d, ld_output_row_d, + ld_output_batch, working_space, thread_id, n_threads); } } } @@ -310,20 +315,19 @@ public: } protected: - virtual void execute_internal( - const DepthwiseArgs &instance_args, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; + virtual void execute_internal(const DepthwiseArgs &instance_args, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; virtual bool uses_premultiply() const { diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp index a5db793b3d..5ff848e281 100644 --- a/src/core/NEON/kernels/assembly/depthwise_common.hpp +++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp @@ -49,11 +49,7 @@ struct KernelDescription bool is_default = false; uint64_t cycle_estimate = 0; - KernelDescription( - DepthwiseMethod method, - std::string name, - bool is_default, - uint64_t cycle_estimate) + KernelDescription(DepthwiseMethod 
method, std::string name, bool is_default, uint64_t cycle_estimate) : method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate) { } @@ -78,58 +74,51 @@ public: // pointer the bias vector (which may be nullptr in the case of no bias) and // a pointer to the array of weights (stored in HWIO order). virtual void pack_parameters( - void *buffer, - const void *biases, - const void *weights, - size_t ld_weight_col = 0, - size_t ld_weight_row = 0) = 0; + void *buffer, const void *biases, const void *weights, size_t ld_weight_col = 0, size_t ld_weight_row = 0) = 0; // Determine the amount of working space required virtual size_t get_working_size(unsigned int n_threads) const = 0; // Execute the convolution over the specified area of memory. - virtual void execute( - const void *input, // Pointer to input tensor - const void *parameters, // Packed parameters buffer - void *output, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; - - virtual void execute( - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; - - virtual void execute( - unsigned int batches, - unsigned int input_height, - unsigned int input_width, - unsigned int channels, - const PaddingValues &, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - unsigned int output_height, - unsigned int output_width, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; + virtual void execute(const void *input, // Pointer to input tensor + const void *parameters, // Packed parameters buffer + void *output, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; + + virtual void execute(const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; + + virtual void execute(unsigned int batches, + unsigned int input_height, + unsigned int input_width, + unsigned int channels, + const PaddingValues &, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + unsigned int output_height, + unsigned int output_width, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; // To handle a dilation factor of D execute the kernel once for each d in @@ -145,12 +134,13 @@ public: // - Number of valid input pixels corresponding to `d` // - Offset of the first pixel corresponding to `d` // - Amount of padding in the view for `d` -std::tuple<size_t, size_t, size_t, size_t, size_t> -get_reduced_view_for_dilation( - size_t out_size, size_t in_size, - size_t d, size_t dilation_factor, - size_t kernel_size, size_t stride, - size_t pad_before); +std::tuple<size_t, size_t, size_t, size_t, size_t> get_reduced_view_for_dilation(size_t out_size, + size_t in_size, + size_t d, + size_t dilation_factor, + size_t kernel_size, + size_t stride, + size_t 
pad_before); } // namespace depthwise } // namespace arm_conv diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp index f1f70cf1d6..045f9f95d3 100644 --- a/src/core/NEON/kernels/assembly/pool_common.hpp +++ b/src/core/NEON/kernels/assembly/pool_common.hpp @@ -68,45 +68,42 @@ public: virtual size_t get_working_size(unsigned int num_threads) const = 0; // Execute pooling over the specified area of memory. - virtual void execute( - const void *const input, - void *const output, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(const void *const input, + void *const output, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; - virtual void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; - virtual void execute( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const PaddingValues &, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const PaddingValues &, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; }; } // namespace pooling diff --git a/src/core/NEON/kernels/assembly/pooling.hpp b/src/core/NEON/kernels/assembly/pooling.hpp index e8db35c593..89d594298e 100644 --- a/src/core/NEON/kernels/assembly/pooling.hpp +++ b/src/core/NEON/kernels/assembly/pooling.hpp @@ -36,9 +36,8 @@ struct PoolingConfig PoolingMethod method = PoolingMethod::DEFAULT; std::string filter = ""; - PoolingConfig(PoolingMethod method) - : method(method) {}; - PoolingConfig() {}; + PoolingConfig(PoolingMethod method) : method(method){}; + PoolingConfig(){}; }; struct PoolingArgs @@ -57,30 +56,40 @@ struct PoolingArgs const PoolingConfig *config; - PoolingArgs( - const CPUInfo *cpu_info, - PoolingType pool_type, - const PoolingWindow &window, - const PoolingStride &stride, - bool exclude_padding, - unsigned int n_batches, - unsigned int input_rows, - unsigned int input_cols, - unsigned int n_channels, - unsigned int output_rows, - unsigned int output_cols, - const PaddingValues &padding, - const PoolingConfig *cfg) - : cpu_info(cpu_info), pool_type(pool_type), pool_window(window), pool_stride(stride), exclude_padding(exclude_padding), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols), - 
n_channels(n_channels), output_rows(output_rows), output_cols(output_cols), padding(padding), config(cfg) + PoolingArgs(const CPUInfo *cpu_info, + PoolingType pool_type, + const PoolingWindow &window, + const PoolingStride &stride, + bool exclude_padding, + unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int n_channels, + unsigned int output_rows, + unsigned int output_cols, + const PaddingValues &padding, + const PoolingConfig *cfg) + : cpu_info(cpu_info), + pool_type(pool_type), + pool_window(window), + pool_stride(stride), + exclude_padding(exclude_padding), + n_batches(n_batches), + input_rows(input_rows), + input_cols(input_cols), + n_channels(n_channels), + output_rows(output_rows), + output_cols(output_cols), + padding(padding), + config(cfg) { // If either of the pooling window dimensions are set to zero, meaning // "pool everything", then replace with the corresponding input dimension. - if(pool_window.rows == 0) + if (pool_window.rows == 0) { pool_window.rows = input_rows; } - if(pool_window.cols == 0) + if (pool_window.cols == 0) { pool_window.cols = input_cols; } @@ -100,10 +109,16 @@ struct Requantize32 int32_t per_layer_right_shift = 0; int32_t per_layer_mul = 0; - Requantize32(int32_t input_offset, int32_t output_offset, - int32_t per_layer_left_shift, int32_t per_layer_right_shift, + Requantize32(int32_t input_offset, + int32_t output_offset, + int32_t per_layer_left_shift, + int32_t per_layer_right_shift, int32_t per_layer_mul) - : input_offset(input_offset), output_offset(output_offset), per_layer_left_shift(per_layer_left_shift), per_layer_right_shift(per_layer_right_shift), per_layer_mul(per_layer_mul) + : input_offset(input_offset), + output_offset(output_offset), + per_layer_left_shift(per_layer_left_shift), + per_layer_right_shift(per_layer_right_shift), + per_layer_mul(per_layer_mul) { } }; @@ -115,105 +130,88 @@ protected: const PoolingArgs m_args; public: - PoolingCommon(const PoolingArgs &args) - : m_args(args) + PoolingCommon(const PoolingArgs &args) : m_args(args) { } - PoolingCommon(PoolingCommon &) = delete; + PoolingCommon(PoolingCommon &) = delete; PoolingCommon &operator=(PoolingCommon &) = delete; size_t get_working_size(unsigned int) const override = 0; // Execute pooling over the specified area of memory. 
- void execute( - const void *const input, - void *const output, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const override + void execute(const void *const input, + void *const output, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override { - this->execute( - input, - m_args.n_channels, - m_args.n_channels * m_args.input_cols, - m_args.n_channels * m_args.input_cols * m_args.input_rows, - output, - m_args.n_channels, - m_args.n_channels * m_args.output_cols, - m_args.n_channels * m_args.output_cols * m_args.output_rows, - working_space, - thread_id, num_threads); + this->execute(input, m_args.n_channels, m_args.n_channels * m_args.input_cols, + m_args.n_channels * m_args.input_cols * m_args.input_rows, output, m_args.n_channels, + m_args.n_channels * m_args.output_cols, + m_args.n_channels * m_args.output_cols * m_args.output_rows, working_space, thread_id, + num_threads); } - void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const override + void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override { - this->execute( - m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, - input, ld_input_col, ld_input_row, ld_input_batch, - m_args.padding, m_args.output_rows, m_args.output_cols, - output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, num_threads); + this->execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, input, ld_input_col, + ld_input_row, ld_input_batch, m_args.padding, m_args.output_rows, m_args.output_cols, output, + ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, num_threads); } - void execute( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const PaddingValues &padding, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const override + void execute(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const PaddingValues &padding, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override { - this->execute_internal( - batches, height, width, channels, padding, - input, ld_input_col, ld_input_row, ld_input_batch, - output_height, output_width, - output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, num_threads); + this->execute_internal(batches, height, width, channels, padding, input, ld_input_col, ld_input_row, + ld_input_batch, output_height, output_width, output, 
ld_output_col, ld_output_row, + ld_output_batch, working_space, thread_id, num_threads); } protected: - virtual void execute_internal( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const PaddingValues &, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute_internal(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const PaddingValues &, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; }; template <typename TInput, typename TOutput> diff --git a/src/core/NEON/kernels/assembly/premultiply.hpp b/src/core/NEON/kernels/assembly/premultiply.hpp index 16f26de38a..fb97cf8baf 100644 --- a/src/core/NEON/kernels/assembly/premultiply.hpp +++ b/src/core/NEON/kernels/assembly/premultiply.hpp @@ -44,30 +44,27 @@ void do_premultiply(const T *in_ptr, const unsigned input_channels, const unsigned int channel_multiplier) { - if(sizeof(T) == 4 && channel_multiplier == 6) + if (sizeof(T) == 4 && channel_multiplier == 6) { - do_premultiply_float_6( - (const float *)in_ptr, ld_row, ld_col, - (float *)out_ptr, out_ld_row, out_ld_col, - tile_rows, tile_cols, - input_channels); + do_premultiply_float_6((const float *)in_ptr, ld_row, ld_col, (float *)out_ptr, out_ld_row, out_ld_col, + tile_rows, tile_cols, input_channels); } else { - for(unsigned int i = 0; i < tile_rows; i++) + for (unsigned int i = 0; i < tile_rows; i++) { const T *ip2 = in_ptr + i * ld_row; T *op2 = out_ptr + i * out_ld_row; - for(unsigned int j = 0; j < tile_cols; j++) + for (unsigned int j = 0; j < tile_cols; j++) { const T *ip = ip2; T *op = op2; - for(unsigned int c = 0; c < input_channels; c++) + for (unsigned int c = 0; c < input_channels; c++) { T val = *ip; ip++; - for(unsigned int r = 0; r < channel_multiplier; r++) + for (unsigned int r = 0; r < channel_multiplier; r++) { op[r] = val; } diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp index 50290757ec..dbf95d23cd 100644 --- a/src/core/NEON/kernels/assembly/winograd.hpp +++ b/src/core/NEON/kernels/assembly/winograd.hpp @@ -45,17 +45,24 @@ struct ConvolutionArgs Shape2D kernel_shape; arm_gemm::Activation activation; - ConvolutionArgs( - unsigned int n_batches, - const Shape2D &input_shape, - unsigned int n_input_channels, - unsigned int pad_top, unsigned int pad_left, - const Shape2D &output_shape, - unsigned int n_output_channels, - const Shape2D kernel_shape, - const arm_gemm::Activation &activation = {}) - : n_batches(n_batches), input_shape(input_shape), n_input_channels(n_input_channels), pad_top(pad_top), pad_left(pad_left), output_shape(output_shape), n_output_channels(n_output_channels), - kernel_shape(kernel_shape), activation(activation) + ConvolutionArgs(unsigned int n_batches, + const Shape2D &input_shape, + unsigned int n_input_channels, + unsigned int pad_top, + unsigned int pad_left, + const Shape2D &output_shape, + unsigned int 
n_output_channels, + const Shape2D kernel_shape, + const arm_gemm::Activation &activation = {}) + : n_batches(n_batches), + input_shape(input_shape), + n_input_channels(n_input_channels), + pad_top(pad_top), + pad_left(pad_left), + output_shape(output_shape), + n_output_channels(n_output_channels), + kernel_shape(kernel_shape), + activation(activation) { } }; @@ -105,23 +112,30 @@ public: virtual unsigned int get_transformed_tile_rows(void) const = 0; virtual unsigned int get_transformed_tile_cols(void) const = 0; - void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel, - void *outptr, const WinogradDomainSpec &wds, - unsigned int thread_id, unsigned int n_threads) const + void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_row, + size_t ld_in_col, + size_t ld_input_channel, + void *outptr, + const WinogradDomainSpec &wds, + unsigned int thread_id, + unsigned int n_threads) const { - this->execute( - args, inptr, ld_in_row, ld_in_col, ld_input_channel, - outptr, wds.weight_ld_matrix, wds.weight_ld_row, - thread_id, n_threads); + this->execute(args, inptr, ld_in_row, ld_in_col, ld_input_channel, outptr, wds.weight_ld_matrix, + wds.weight_ld_row, thread_id, n_threads); } - virtual void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel, - void *outptr, size_t ld_out_matrix, size_t ld_out_row, - unsigned int thread_id, unsigned int n_threads) const = 0; + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_row, + size_t ld_in_col, + size_t ld_input_channel, + void *outptr, + size_t ld_out_matrix, + size_t ld_out_row, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; } // namespace weight_transform @@ -136,27 +150,35 @@ public: virtual unsigned int get_input_rows(void) const = 0; virtual unsigned int get_input_cols(void) const = 0; - virtual size_t get_working_space_size( - const ConvolutionArgs &args, - unsigned int n_threads) const = 0; - - void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col, - void *outptr, const WinogradDomainSpec &wds, - void *working_space, unsigned int thread_id, unsigned int n_threads) const + virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0; + + void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_row, + size_t ld_in_col, + void *outptr, + const WinogradDomainSpec &wds, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const { - this->execute( - args, inptr, ld_in_batch, ld_in_row, ld_in_col, - outptr, wds.input_ld_batch, wds.input_ld_matrix, wds.input_ld_row, - working_space, thread_id, n_threads); + this->execute(args, inptr, ld_in_batch, ld_in_row, ld_in_col, outptr, wds.input_ld_batch, wds.input_ld_matrix, + wds.input_ld_row, working_space, thread_id, n_threads); } - virtual void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col, - void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row, - void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0; + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_row, + size_t ld_in_col, + void *outptr, + size_t ld_out_batch, + size_t ld_out_matrix, + size_t 
ld_out_row, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; } // namespace input_transform @@ -177,31 +199,37 @@ public: virtual unsigned int get_kernel_rows(void) const = 0; virtual unsigned int get_kernel_cols(void) const = 0; - virtual size_t get_working_space_size( - const ConvolutionArgs &args, - unsigned int n_threads) const = 0; - - void execute( - const ConvolutionArgs &args, - const void *inptr, const WinogradDomainSpec &wds, - const void *bias, - void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, - void *working_space, unsigned int thread_id, unsigned int n_threads) const + virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0; + + void execute(const ConvolutionArgs &args, + const void *inptr, + const WinogradDomainSpec &wds, + const void *bias, + void *outptr, + size_t ld_out_batch, + size_t ld_out_row, + size_t ld_out_col, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const { - this->execute( - args, - inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, - bias, - outptr, ld_out_batch, ld_out_row, ld_out_col, - working_space, thread_id, n_threads); + this->execute(args, inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, bias, outptr, + ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads); } - virtual void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row, - const void *bias, - void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, - void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0; + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_matrix, + size_t ld_in_row, + const void *bias, + void *outptr, + size_t ld_out_batch, + size_t ld_out_row, + size_t ld_out_col, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; } // namespace output_transform @@ -210,7 +238,7 @@ struct WinogradImpl { const output_transform::ITransform *output_transform = nullptr; const weight_transform::ITransform *weight_transform = nullptr; - const input_transform::ITransform *input_transform = nullptr; + const input_transform::ITransform *input_transform = nullptr; std::unique_ptr<arm_gemm::GemmArgs> gemm_args; WinogradDomainSpec winograd_spec; }; @@ -220,15 +248,18 @@ struct WinogradImpl * Assigns to the pointers in the `dest` struct and returns true or false to * indicate whether the given problem can be executed or not. 
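A usage note for the get_implementation() declaration in this hunk: it only assigns the transform pointers and GEMM arguments of dest, and a false return means no Winograd kernel suits the given problem, so the caller falls back to another convolution algorithm. A hedged sketch, assuming a valid cpu_info pointer and an already populated ConvolutionArgs named conv_args (both names are placeholders):

    arm_conv::winograd::WinogradImpl impl{};
    const bool ok = arm_conv::winograd::get_implementation<float>(
        impl, cpu_info, conv_args,
        /* max_threads */ 4,
        /* fast_mode */ false,
        /* WinogradConfig * */ nullptr,
        /* arm_gemm::GemmConfig * */ nullptr);
    if (!ok)
    {
        // No suitable Winograd implementation; pick a different convolution method.
    }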
*/ -template <typename TIn, typename TWeight = TIn, typename TOut = TIn, typename TWinogradIn = TIn, typename TWinogradOut = TOut> -bool get_implementation( - WinogradImpl &dest, // Destination for the selected implementation - const CPUInfo *, - const ConvolutionArgs &, - int max_threads, - bool fast_mode, - const WinogradConfig *, - const arm_gemm::GemmConfig *); +template <typename TIn, + typename TWeight = TIn, + typename TOut = TIn, + typename TWinogradIn = TIn, + typename TWinogradOut = TOut> +bool get_implementation(WinogradImpl &dest, // Destination for the selected implementation + const CPUInfo *, + const ConvolutionArgs &, + int max_threads, + bool fast_mode, + const WinogradConfig *, + const arm_gemm::GemmConfig *); } // namespace winograd } // namespace arm_conv diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp index ed5254a0a4..e3d9b670b3 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp @@ -24,8 +24,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" + #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" #include <arm_neon.h> @@ -37,12 +38,26 @@ namespace arm_compute { namespace { -using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window); +using BatchNomalizationPtr = void (*)(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window); template <typename T> -void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>; @@ -57,86 +72,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? 
reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; T activation_functor(act_info); const auto epsilon_vec = wrapper::vdup_n(static_cast<float16_t>(epsilon), ExactTagType{}); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); - - // Perform core calculations using vector operations - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = wrapper::vloadq(input_mean + x); - const auto var_vec = wrapper::vloadq(input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{}); - const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{}); - - // Calculate denominator - const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - - // Calculate x bar - const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); - const auto x_bar = wrapper::vmul(numerator, denominator); - auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); - - // Perform fused activation - if(act_info.enabled()) + const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); + + // Perform core calculations using vector operations + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - activation_functor(res); + // Conctruct vectors + const auto mean_vec = wrapper::vloadq(input_mean + x); + const auto var_vec = wrapper::vloadq(input_var + x); + const auto gamma_vec = (input_gamma != nullptr) + ? wrapper::vloadq(input_gamma + x) + : wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{}); + const auto beta_vec = (input_beta != nullptr) + ? wrapper::vloadq(input_beta + x) + : wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{}); + + // Calculate denominator + const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + + // Calculate x bar + const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); + const auto x_bar = wrapper::vmul(numerator, denominator); + auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + wrapper::vstore(output_ptr + x, res); } - // Store results - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Conctruct vectors - const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; - const float16_t beta = (input_beta != nullptr) ? 
input_beta[x] : 0.f; - - const float16_t denominator = sqrt(input_var[x] + epsilon); - const float16_t numerator = input_ptr[x] - input_mean[x]; - const float16_t x_bar = numerator / denominator; - float16_t res = beta + x_bar * gamma; - - // Perform fused activation - if(act_info.enabled()) + // Compute left-over elements + for (; x < window_end_x; ++x) { - activation_functor(res); + // Conctruct vectors + const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; + const float16_t beta = (input_beta != nullptr) ? input_beta[x] : 0.f; + + const float16_t denominator = sqrt(input_var[x] + epsilon); + const float16_t numerator = input_ptr[x] - input_mean[x]; + const float16_t x_bar = numerator / denominator; + float16_t res = beta + x_bar * gamma; + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + *reinterpret_cast<float16_t *>(output_ptr + x) = res; } - - // Store results - *reinterpret_cast<float16_t *>(output_ptr + x) = res; - } - }, - input, output); + }, + input, output); } // Fused Batched Normalization with activation functions -static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map = -{ - { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float16_t, 8>> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float16_t, 8>> }, - { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float16_t, 8>> } -}; -} +static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map = { + {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float16_t, 8>>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float16_t, 8>>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float16_t, 8>>}}; +} // namespace namespace cpu { -void fp16_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp16_neon_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { - if(act_info.enabled()) + if (act_info.enabled()) { fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window); } diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp index d6e22e1843..4e1654ee6b 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp @@ -24,8 +24,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" + #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" #include <arm_neon.h> @@ -36,12 +37,26 @@ namespace arm_compute { namespace { -using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window); +using 
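All of the batch-normalization kernels in these files, NEON and SVE alike, implement the same per-element transform in their vector and leftover loops; a scalar reference of it, with the fused activation omitted and a hypothetical function name:

    #include <cmath>

    // out = gamma * (x - mean) / sqrt(var + epsilon) + beta, where gamma
    // defaults to 1 and beta to 0 when the corresponding tensor is absent.
    inline float batch_norm_ref(float x, float mean, float var, float gamma, float beta, float epsilon)
    {
        const float x_bar = (x - mean) / std::sqrt(var + epsilon);
        return beta + x_bar * gamma;
    }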
BatchNomalizationPtr = void (*)(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window); template <typename T> -void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>; @@ -56,86 +71,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; T activation_functor(act_info); const auto epsilon_vec = wrapper::vdup_n(static_cast<float>(epsilon), ExactTagType{}); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const float *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float *>(output.ptr()); - - // Perform core calculations using vector operations - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = wrapper::vloadq(input_mean + x); - const auto var_vec = wrapper::vloadq(input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<float>(1.f), ExactTagType{}); - const auto beta_vec = (input_beta != nullptr) ? 
wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{}); - - // Calculate denominator - const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - - // Calculate x bar - const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); - const auto x_bar = wrapper::vmul(numerator, denominator); - auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); - - // Perform fused activation - if(act_info.enabled()) + const auto input_ptr = reinterpret_cast<const float *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(output.ptr()); + + // Perform core calculations using vector operations + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - activation_functor(res); + // Conctruct vectors + const auto mean_vec = wrapper::vloadq(input_mean + x); + const auto var_vec = wrapper::vloadq(input_var + x); + const auto gamma_vec = (input_gamma != nullptr) + ? wrapper::vloadq(input_gamma + x) + : wrapper::vdup_n(static_cast<float>(1.f), ExactTagType{}); + const auto beta_vec = (input_beta != nullptr) + ? wrapper::vloadq(input_beta + x) + : wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{}); + + // Calculate denominator + const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + + // Calculate x bar + const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); + const auto x_bar = wrapper::vmul(numerator, denominator); + auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + wrapper::vstore(output_ptr + x, res); } - // Store results - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Conctruct vectors - const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; - const float beta = (input_beta != nullptr) ? input_beta[x] : 0.f; - - const float denominator = sqrt(input_var[x] + epsilon); - const float numerator = input_ptr[x] - input_mean[x]; - const float x_bar = numerator / denominator; - float res = beta + x_bar * gamma; - - // Perform fused activation - if(act_info.enabled()) + // Compute left-over elements + for (; x < window_end_x; ++x) { - activation_functor(res); + // Conctruct vectors + const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; + const float beta = (input_beta != nullptr) ? 
input_beta[x] : 0.f; + + const float denominator = sqrt(input_var[x] + epsilon); + const float numerator = input_ptr[x] - input_mean[x]; + const float x_bar = numerator / denominator; + float res = beta + x_bar * gamma; + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + *reinterpret_cast<float *>(output_ptr + x) = res; } - - // Store results - *reinterpret_cast<float *>(output_ptr + x) = res; - } - }, - input, output); + }, + input, output); } // Fused Batched Normalization with activation functions -static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map = -{ - { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float, 4>> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float, 4>> }, - { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float, 4>> } -}; -} +static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map = { + {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float, 4>>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float, 4>>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float, 4>>}}; +} // namespace namespace cpu { -void fp32_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp32_neon_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { - if(act_info.enabled()) + if (act_info.enabled()) { fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window); } diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp index 98cd9aa7fe..48caaa3e63 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/SVEMath.h" #include <cmath> @@ -37,8 +38,15 @@ namespace arm_compute { namespace cpu { -void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp16_sve_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); @@ -49,69 +57,74 @@ void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 
0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; const auto epsilon_vec = svdup_n_f16(epsilon); const auto const_1 = svdup_n_f16(1.f); const auto const_0 = svdup_n_f16(0.f); const auto va = svdup_n_f16(act_info.a()); const auto vb = svdup_n_f16(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = svld1_f16(pg, input_mean + x); - const auto var_vec = svld1_f16(pg, input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1; - const auto beta_vec = (input_beta != nullptr) ? svld1_f16(pg, input_beta + x) : const_0; + const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); - // Calculate denominator - const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec); - auto denominator = svrsqrte_f16(tmp); - denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); - denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + // Conctruct vectors + const auto mean_vec = svld1_f16(pg, input_mean + x); + const auto var_vec = svld1_f16(pg, input_var + x); + const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1; + const auto beta_vec = (input_beta != nullptr) ? 
svld1_f16(pg, input_beta + x) : const_0; - // Calculate x bar - const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec); - const auto x_bar = svmul_f16_z(pg, numerator, denominator); - auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec); + // Calculate denominator + const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec); + auto denominator = svrsqrte_f16(tmp); + denominator = + svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); + denominator = + svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); - // Perform fused activation - if(act_info.enabled()) - { - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) - { - res = svmax_f16_z(pg, const_0, res); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res)); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Calculate x bar + const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec); + const auto x_bar = svmul_f16_z(pg, numerator, denominator); + auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) { - res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res)); + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + { + res = svmax_f16_z(pg, const_0, res); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res)); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res)); + } } - } - // Store results - svst1_f16(pg, output_ptr + x, res); + // Store results + svst1_f16(pg, output_ptr + x, res); - x += svcntw(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); + x += svcntw(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(svptrue_b16(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp index 952ab320bf..df4fbfe607 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/SVEMath.h" #include <cmath> @@ -37,8 +38,15 @@ namespace arm_compute { namespace cpu { -void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp32_sve_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); @@ -49,69 +57,74 @@ void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea Iterator input(src, win_collapsed); Iterator output(dst, 
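The SVE loops above never take an exact square root: svrsqrte_f16/svrsqrte_f32 yields a low-precision estimate of 1/sqrt(var + epsilon), which the kernel refines twice with svrsqrts. Each refinement is one Newton-Raphson step; in scalar form, with a hypothetical helper name and relying on FRSQRTS computing (3 - x*y) / 2:

    // One Newton-Raphson step towards 1/sqrt(a), given the current estimate d.
    // Mirrors the kernel's: d = svmul(svrsqrts(svmul(a, d), d), d)
    inline float rsqrt_refine(float a, float d)
    {
        return ((3.0f - (a * d) * d) * 0.5f) * d;
    }

    // As in the kernels, two such steps are applied to the initial estimate.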
win_collapsed); - const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; const auto epsilon_vec = svdup_n_f32(epsilon); const auto const_1 = svdup_n_f32(1.f); const auto const_0 = svdup_n_f32(0.f); const auto va = svdup_n_f32(act_info.a()); const auto vb = svdup_n_f32(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const float *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float *>(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b32(x, window_end_x); - do + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = svld1_f32(pg, input_mean + x); - const auto var_vec = svld1_f32(pg, input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1; - const auto beta_vec = (input_beta != nullptr) ? svld1_f32(pg, input_beta + x) : const_0; + const auto input_ptr = reinterpret_cast<const float *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(output.ptr()); - // Calculate denominator - const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec); - auto denominator = svrsqrte_f32(tmp); - denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); - denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do + { + // Conctruct vectors + const auto mean_vec = svld1_f32(pg, input_mean + x); + const auto var_vec = svld1_f32(pg, input_var + x); + const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1; + const auto beta_vec = (input_beta != nullptr) ? 
svld1_f32(pg, input_beta + x) : const_0; - // Calculate x bar - const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec); - const auto x_bar = svmul_f32_z(pg, numerator, denominator); - auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec); + // Calculate denominator + const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec); + auto denominator = svrsqrte_f32(tmp); + denominator = + svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); + denominator = + svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); - // Perform fused activation - if(act_info.enabled()) - { - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) - { - res = svmax_f32_z(pg, const_0, res); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res)); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Calculate x bar + const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec); + const auto x_bar = svmul_f32_z(pg, numerator, denominator); + auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) { - res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res)); + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + { + res = svmax_f32_z(pg, const_0, res); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res)); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res)); + } } - } - // Store results - svst1_f32(pg, output_ptr + x, res); + // Store results + svst1_f32(pg, output_ptr + x, res); - x += svcntw(); - pg = svwhilelt_b32(x, window_end_x); - } - while(svptest_any(svptrue_b32(), pg)); - }, - input, output); + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); + } while (svptest_any(svptrue_b32(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/core/NEON/kernels/batchnormalization/impl/list.h b/src/core/NEON/kernels/batchnormalization/impl/list.h index 8e0ea36f5a..cbf540bd71 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/list.h +++ b/src/core/NEON/kernels/batchnormalization/impl/list.h @@ -28,9 +28,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \ - void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, \ - float epsilon, ActivationLayerInfo &act_info, const Window &window) +#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \ + void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, \ + const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window) DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_neon_batch_normalization); DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_sve_batch_normalization); diff --git a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h index 3900ea62cd..95cdc8f2f9 100644 --- a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h +++ b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h @@ -25,6 +25,7 @@ #define 
ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -158,8 +159,7 @@ struct logistic * * @param[in] act_info Activation layer information. */ - explicit logistic(ActivationLayerInfo act_info) - : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{})) + explicit logistic(ActivationLayerInfo act_info) : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{})) { ARM_COMPUTE_UNUSED(act_info); } @@ -198,8 +198,7 @@ struct relu * * @param[in] act_info Activation layer information. */ - explicit relu(ActivationLayerInfo act_info) - : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{})) + explicit relu(ActivationLayerInfo act_info) : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{})) { ARM_COMPUTE_UNUSED(act_info); } diff --git a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl index ac196d9dbb..50fff04cad 100644 --- a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl +++ b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IMultiImage.h" #include "arm_compute/core/Utils.h" + #include "src/core/NEON/NEMath.h" #include <arm_neon.h> @@ -50,8 +51,12 @@ constexpr float rgb2u8_red_coef = 0.2126f; constexpr float rgb2u8_green_coef = 0.7152f; constexpr float rgb2u8_blue_coef = 0.0722f; -inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor, - const float rcoef, const float gcoef, const float bcoef) +inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, + const float32x4_t &gcolor, + const float32x4_t &bcolor, + const float rcoef, + const float gcoef, + const float bcoef) { float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef); greyscale = vmlaq_n_f32(greyscale, gcolor, gcoef); @@ -86,8 +91,12 @@ inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out) arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out); } -inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec, - float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec) +inline void rgb_to_yuv_calculation(const float32x4_t &rvec, + const float32x4_t &gvec, + const float32x4_t &bvec, + float32x4_t &yvec, + float32x4_t &uvec, + float32x4_t &vvec) { /* Y'= 0.2126*R' + 0.7152*G' + 0.0722*B' @@ -110,8 +119,12 @@ inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &g vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv); } -inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val, - float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha) +inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, + float32x4_t uvec_val, + const float32x4_t &yyvec_val, + float32x4_t vvec_val, + unsigned char *output_ptr, + const bool alpha) { float32x4x3_t rgb1, rgb2; @@ -126,8 +139,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve // b = 1.8556f*f_u + 0.0000f*f_v; const auto red = vmulq_n_f32(vvec_val, red_coef_bt709); const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709); - const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), - vmulq_n_f32(vvec_val, green_coef2_bt709)); + const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), vmulq_n_f32(vvec_val, 
green_coef2_bt709)); // Compute the final r,g,b values using y1 for the first texel and y2 for the second one. // the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t @@ -144,7 +156,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve uint8x8x3_t u8_rgb; arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb); - if(!alpha) + if (!alpha) { vst3_lane_u8(&output_ptr[0], u8_rgb, 0); vst3_lane_u8(&output_ptr[3], u8_rgb, 4); @@ -177,7 +189,7 @@ inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha) { uint8x16x3_t rgb; - if(alpha) + if (alpha) { const auto tmp = vld4q_u8(ptr); rgb.val[0] = tmp.val[0]; @@ -206,12 +218,12 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto float32x4x4_t fyvec_top, fuvec_top, fvvec_top; float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom; - for(auto i = 0; i < 4; ++i) + for (auto i = 0; i < 4; ++i) { - rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], - fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]); - rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], - fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]); + rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], fyvec_top.val[i], fuvec_top.val[i], + fvvec_top.val[i]); + rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], fyvec_bottom.val[i], + fuvec_bottom.val[i], fvvec_bottom.val[i]); } arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]); @@ -222,9 +234,14 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]); } -inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, - const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom, - unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, +inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, + const uint8x16_t &gvec_top, + const uint8x16_t &bvec_top, + const uint8x16_t &rvec_bottom, + const uint8x16_t &gvec_bottom, + const uint8x16_t &bvec_bottom, + unsigned char *const __restrict out_y_top, + unsigned char *const __restrict out_y_bottom, unsigned char *const __restrict out_uv) { uint8x16x3_t vec_top, vec_bottom; @@ -252,9 +269,14 @@ inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec vst2_u8(out_uv, uvvec); } -inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, - const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom, - unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, +inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, + const uint8x16_t &gvec_top, + const uint8x16_t &bvec_top, + const uint8x16_t &rvec_bottom, + const uint8x16_t &gvec_bottom, + const uint8x16_t &bvec_bottom, + unsigned char *const __restrict out_y_top, + unsigned char *const __restrict out_y_bottom, unsigned char *const __restrict out_u, unsigned char *const __restrict out_v) { @@ -273,14 +295,16 @@ inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]); const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], 
vec_bottom.val[2]); - const auto uvvec = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), - vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1])); + const auto uvvec = + vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1])); vst1_u8(out_u, vget_low_u8(uvvec)); vst1_u8(out_v, vget_high_u8(uvvec)); } -inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec, +inline void store_rgb_to_yuv4(const uint8x16_t &rvec, + const uint8x16_t &gvec, + const uint8x16_t &bvec, unsigned char *const __restrict out_y, unsigned char *const __restrict out_u, unsigned char *const __restrict out_v) @@ -291,10 +315,9 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec); float32x4x4_t fyvec, fuvec, fvvec; - for(auto i = 0; i < 4; ++i) + for (auto i = 0; i < 4; ++i) { - rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], - fyvec.val[i], fuvec.val[i], fvvec.val[i]); + rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], fyvec.val[i], fuvec.val[i], fvvec.val[i]); } uint8x16_t yvec, uvec, vvec; @@ -307,7 +330,7 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co vst1q_u8(out_v, vvec); } #endif /* DOXYGEN_SKIP_THIS */ -} +} // namespace namespace arm_compute { @@ -329,17 +352,19 @@ void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict out Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta1 = vld3q_u8(in.ptr()); - uint8x16x4_t ta2; - ta2.val[0] = ta1.val[0]; - ta2.val[1] = ta1.val[1]; - ta2.val[2] = ta1.val[2]; - ta2.val[3] = vdupq_n_u8(255); - vst4q_u8(out.ptr(), ta2); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta1 = vld3q_u8(in.ptr()); + uint8x16x4_t ta2; + ta2.val[0] = ta1.val[0]; + ta2.val[1] = ta1.val[1]; + ta2.val[2] = ta1.val[2]; + ta2.val[3] = vdupq_n_u8(255); + vst4q_u8(out.ptr(), ta2); + }, + in, out); } /** Convert RGB to U8. @@ -360,14 +385,16 @@ void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict outpu Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta1 = vld3q_u8(in.ptr()); - uint8x16_t ta2; - rgb_to_u8_conversion(ta1, ta2); - vst1q_u8(out.ptr(), ta2); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta1 = vld3q_u8(in.ptr()); + uint8x16_t ta2; + rgb_to_u8_conversion(ta1, ta2); + vst1q_u8(out.ptr(), ta2); + }, + in, out); } /** Convert RGBX to RGB. @@ -388,16 +415,18 @@ void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta1 = vld4q_u8(in.ptr()); - uint8x16x3_t ta2; - ta2.val[0] = ta1.val[0]; - ta2.val[1] = ta1.val[1]; - ta2.val[2] = ta1.val[2]; - vst3q_u8(out.ptr(), ta2); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta1 = vld4q_u8(in.ptr()); + uint8x16x3_t ta2; + ta2.val[0] = ta1.val[0]; + ta2.val[1] = ta1.val[1]; + ta2.val[2] = ta1.val[2]; + vst3q_u8(out.ptr(), ta2); + }, + in, out); } /** Convert YUYV to RGB. 
@@ -422,26 +451,32 @@ void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict out Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta = vld4q_u8(in.ptr()); - //ta.val[0] = Y0 Y2 Y4 Y6 ... - //ta.val[1] = U0 U2 U4 U6 ... - //ta.val[2] = Y1 Y3 Y5 Y7 ... - //ta.val[3] = V0 V2 V4 V7 ... - - // Convert the uint8x16x4_t to float32x4x4_t - const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]); - const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]); - const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]); - const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]); - - yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta = vld4q_u8(in.ptr()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + // Convert the uint8x16x4_t to float32x4x4_t + const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]); + const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]); + const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]); + const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]); + + yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, + alpha); + yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, + alpha); + yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, + alpha); + yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, + alpha); + }, + in, out); } /** Convert NV12 to RGB. @@ -475,35 +510,45 @@ void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict out Iterator in_uv(input_ptr->plane(1), win_uv); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_uv = vld2q_u8(in_uv.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... 
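For the NV12 kernel in this hunk: 4:2:0 formats store one interleaved UV pair per 2x2 block of luma, which is why the loop reads a single UV row while converting a top and a bottom Y row in each iteration. A scalar sketch of the addressing, where uv_row, y_at, rgb_at and to_rgb are hypothetical helpers:

    for (unsigned int r = 0; r < height; r += 2)
    {
        for (unsigned int c = 0; c < width; ++c)
        {
            const uint8_t u = uv_row(r / 2)[(c / 2) * 2 + 0]; // shared by the 2x2 block
            const uint8_t v = uv_row(r / 2)[(c / 2) * 2 + 1];
            to_rgb(y_at(r, c), u, v, rgb_at(r, c));         // top row
            to_rgb(y_at(r + 1, c), u, v, rgb_at(r + 1, c)); // bottom row
        }
    }

The 0 + shift / 1 - shift indexing on the UV lanes presumably lets the NV21 variant reuse the same loop with the U and V planes swapped.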
- - // Convert the uint8x16x4_t to float32x4x4_t - float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); - float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); - float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); - float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); - float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]); - float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]); - - yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); - - yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha); - }, - in_y, in_uv, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... 
+ + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); + float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); + float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); + float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); + float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]); + float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]); + + yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], + out.ptr() + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], + out.ptr() + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], + out.ptr() + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], + out.ptr() + 3 * element_size, alpha); + + yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], + out.ptr() + out_stride + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], + out.ptr() + out_stride + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], + out.ptr() + out_stride + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], + out.ptr() + out_stride + 3 * element_size, alpha); + }, + in_y, in_uv, out); } /** Convert IYUV to RGB. @@ -537,59 +582,71 @@ void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict out Iterator in_v(input_ptr->plane(2), win_uv); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto *y_top_ptr = in_y.ptr(); - const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y(); - const auto *u_ptr = in_u.ptr(); - const auto *v_ptr = in_v.ptr(); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto *y_top_ptr = in_y.ptr(); + const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y(); + const auto *u_ptr = in_u.ptr(); + const auto *v_ptr = in_v.ptr(); // Work-around issue in gcc 9(>=) where vld2q might cause issues with register allocation #if defined(__arch64__) - const auto ta0_y_top = vld1q_u8(y_top_ptr); - const auto ta1_y_top = vld1q_u8(y_top_ptr + 16); - const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr); - const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16); - const auto ta_u = vld1q_u8(u_ptr); - const auto ta_v = vld1q_u8(v_ptr); - - // Convert the uint8x16x4_t to float32x4x4_t - float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top)); - float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top)); - float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom)); - float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom)); - float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); - float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); + const auto ta0_y_top = vld1q_u8(y_top_ptr); + const auto 
ta1_y_top = vld1q_u8(y_top_ptr + 16); + const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr); + const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16); + const auto ta_u = vld1q_u8(u_ptr); + const auto ta_v = vld1q_u8(v_ptr); + + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top)); + float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top)); + float32x4x4_t yvec_bottom = + arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom)); + float32x4x4_t yyvec_bottom = + arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom)); + float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); + float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); #else /* defined(__arch64__) */ - const auto ta_y_top = vld2q_u8(y_top_ptr); - const auto ta_y_bottom = vld2q_u8(y_bottom_ptr); - const auto ta_u = vld1q_u8(u_ptr); - const auto ta_v = vld1q_u8(v_ptr); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_u.val[0] = U0 U2 U4 U6 ... - //ta_v.val[0] = V0 V2 V4 V6 ... - - // Convert the uint8x16x4_t to float32x4x4_t - float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); - float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); - float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); - float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); - float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); - float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); + const auto ta_y_top = vld2q_u8(y_top_ptr); + const auto ta_y_bottom = vld2q_u8(y_bottom_ptr); + const auto ta_u = vld1q_u8(u_ptr); + const auto ta_v = vld1q_u8(v_ptr); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_u.val[0] = U0 U2 U4 U6 ... + //ta_v.val[0] = V0 V2 V4 V6 ... 
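            // Note: the #if above tests __arch64__, but the compiler-defined macro for
            // AArch64 targets is __aarch64__, so unless the build defines __arch64__
            // itself, this vld2q branch is presumably the one that always compiles and
            // the vuzp-based work-around stays dead.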
+ + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); + float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); + float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); + float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); + float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); + float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); #endif /* defined(__arch64__) */ - yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); - - yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha); - }, - in_y, in_u, in_v, out); + yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], + out.ptr() + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], + out.ptr() + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], + out.ptr() + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], + out.ptr() + 3 * element_size, alpha); + + yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], + out.ptr() + out_stride + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], + out.ptr() + out_stride + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], + out.ptr() + out_stride + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], + out.ptr() + out_stride + 3 * element_size, alpha); + }, + in_y, in_u, in_v, out); } /** Convert YUYV to NV12. @@ -621,31 +678,33 @@ void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict ou Iterator out_y(output_ptr->plane(0), win); Iterator out_uv(output_ptr->plane(1), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_top = vld4q_u8(in.ptr()); - const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); - //ta.val[0] = Y0 Y2 Y4 Y6 ... - //ta.val[1] = U0 U2 U4 U6 ... - //ta.val[2] = Y1 Y3 Y5 Y7 ... - //ta.val[3] = V0 V2 V4 V7 ... 
- - uint8x16x2_t yvec; - yvec.val[0] = ta_top.val[0 + shift]; - yvec.val[1] = ta_top.val[2 + shift]; - vst2q_u8(out_y.ptr(), yvec); - - uint8x16x2_t yyvec; - yyvec.val[0] = ta_bottom.val[0 + shift]; - yyvec.val[1] = ta_bottom.val[2 + shift]; - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); - - uint8x16x2_t uvvec; - uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); - uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); - vst2q_u8(out_uv.ptr(), uvvec); - }, - in, out_y, out_uv); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_top = vld4q_u8(in.ptr()); + const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + uint8x16x2_t yvec; + yvec.val[0] = ta_top.val[0 + shift]; + yvec.val[1] = ta_top.val[2 + shift]; + vst2q_u8(out_y.ptr(), yvec); + + uint8x16x2_t yyvec; + yyvec.val[0] = ta_bottom.val[0 + shift]; + yyvec.val[1] = ta_bottom.val[2 + shift]; + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); + + uint8x16x2_t uvvec; + uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); + uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); + vst2q_u8(out_uv.ptr(), uvvec); + }, + in, out_y, out_uv); } /** Convert IYUV to NV12. @@ -676,23 +735,25 @@ void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict ou Iterator out_y(output_ptr->plane(0), win); Iterator out_uv(output_ptr->plane(1), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - uint8x16x2_t ta_uv; - ta_uv.val[0] = vld1q_u8(in_u.ptr()); - ta_uv.val[1] = vld1q_u8(in_v.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - vst2q_u8(out_uv.ptr(), ta_uv); - }, - in_y, in_u, in_v, out_y, out_uv); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + uint8x16x2_t ta_uv; + ta_uv.val[0] = vld1q_u8(in_u.ptr()); + ta_uv.val[1] = vld1q_u8(in_v.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + vst2q_u8(out_uv.ptr(), ta_uv); + }, + in_y, in_u, in_v, out_y, out_uv); } /** Convert NV12 to IYUV. @@ -726,22 +787,24 @@ void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win_uv); Iterator out_v(output_ptr->plane(2), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_uv = vld2q_u8(in_uv.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... 
- //ta_uv.val[1] = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]); - vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]); - }, - in_y, in_uv, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]); + vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]); + }, + in_y, in_uv, out_y, out_u, out_v); } /** Convert YUYV to IYUV. @@ -774,34 +837,36 @@ void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win_uv); Iterator out_v(output_ptr->plane(2), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_top = vld4q_u8(in.ptr()); - const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); - //ta.val[0] = Y0 Y2 Y4 Y6 ... - //ta.val[1] = U0 U2 U4 U6 ... - //ta.val[2] = Y1 Y3 Y5 Y7 ... - //ta.val[3] = V0 V2 V4 V7 ... - - uint8x16x2_t yvec; - yvec.val[0] = ta_top.val[0 + shift]; - yvec.val[1] = ta_top.val[2 + shift]; - vst2q_u8(out_y.ptr(), yvec); - - uint8x16x2_t yyvec; - yyvec.val[0] = ta_bottom.val[0 + shift]; - yyvec.val[1] = ta_bottom.val[2 + shift]; - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); - - uint8x16_t uvec; - uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); - vst1q_u8(out_u.ptr(), uvec); - - uint8x16_t vvec; - vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); - vst1q_u8(out_v.ptr(), vvec); - }, - in, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_top = vld4q_u8(in.ptr()); + const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + uint8x16x2_t yvec; + yvec.val[0] = ta_top.val[0 + shift]; + yvec.val[1] = ta_top.val[2 + shift]; + vst2q_u8(out_y.ptr(), yvec); + + uint8x16x2_t yyvec; + yyvec.val[0] = ta_bottom.val[0 + shift]; + yyvec.val[1] = ta_bottom.val[2 + shift]; + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); + + uint8x16_t uvec; + uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); + vst1q_u8(out_u.ptr(), uvec); + + uint8x16_t vvec; + vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); + vst1q_u8(out_v.ptr(), vvec); + }, + in, out_y, out_u, out_v); } /** Convert NV12 to YUV4. @@ -835,32 +900,34 @@ void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win); Iterator out_v(output_ptr->plane(2), win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_uv = vld2q_u8(in_uv.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... 
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - - uint8x16x2_t uvec; - uvec.val[0] = ta_uv.val[0 + shift]; - uvec.val[1] = ta_uv.val[0 + shift]; - vst2q_u8(out_u.ptr(), uvec); - vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); - - uint8x16x2_t vvec; - vvec.val[0] = ta_uv.val[1 - shift]; - vvec.val[1] = ta_uv.val[1 - shift]; - vst2q_u8(out_v.ptr(), vvec); - vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); - }, - in_y, in_uv, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + + uint8x16x2_t uvec; + uvec.val[0] = ta_uv.val[0 + shift]; + uvec.val[1] = ta_uv.val[0 + shift]; + vst2q_u8(out_u.ptr(), uvec); + vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); + + uint8x16x2_t vvec; + vvec.val[0] = ta_uv.val[1 - shift]; + vvec.val[1] = ta_uv.val[1 - shift]; + vst2q_u8(out_v.ptr(), vvec); + vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); + }, + in_y, in_uv, out_y, out_u, out_v); } /** Convert IYUV to YUV4. @@ -892,33 +959,35 @@ void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win); Iterator out_v(output_ptr->plane(2), win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_u = vld1q_u8(in_u.ptr()); - const auto ta_v = vld1q_u8(in_v.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_u = U0 U2 U4 U6 ... - //ta_v = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - - uint8x16x2_t uvec; - uvec.val[0] = ta_u; - uvec.val[1] = ta_u; - vst2q_u8(out_u.ptr(), uvec); - vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); - - uint8x16x2_t vvec; - vvec.val[0] = ta_v; - vvec.val[1] = ta_v; - vst2q_u8(out_v.ptr(), vvec); - vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); - }, - in_y, in_u, in_v, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_u = vld1q_u8(in_u.ptr()); + const auto ta_v = vld1q_u8(in_v.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_u = U0 U2 U4 U6 ... + //ta_v = V0 V2 V4 V6 ... 
+ + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + + uint8x16x2_t uvec; + uvec.val[0] = ta_u; + uvec.val[1] = ta_u; + vst2q_u8(out_u.ptr(), uvec); + vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); + + uint8x16x2_t vvec; + vvec.val[0] = ta_v; + vvec.val[1] = ta_v; + vst2q_u8(out_v.ptr(), vvec); + vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); + }, + in_y, in_u, in_v, out_y, out_u, out_v); } /** Convert RGB to NV12. @@ -948,20 +1017,21 @@ void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict out Iterator out_y(output_ptr->plane(0), win); Iterator out_uv(output_ptr->plane(1), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_rgb_top = load_rgb(in.ptr(), alpha); - const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); - //ta_rgb.val[0] = R0 R1 R2 R3 ... - //ta_rgb.val[1] = G0 G1 G2 G3 ... - //ta_rgb.val[2] = B0 B1 B2 B3 ... - - store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], - ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], - out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), - out_uv.ptr()); - }, - in, out_y, out_uv); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_rgb_top = load_rgb(in.ptr(), alpha); + const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0], + ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(), + out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_uv.ptr()); + }, + in, out_y, out_uv); } /** Convert RGB to IYUV. @@ -992,20 +1062,22 @@ void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict out Iterator out_u(output_ptr->plane(1), win_uv); Iterator out_v(output_ptr->plane(2), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_rgb_top = load_rgb(in.ptr(), alpha); - const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); - //ta_rgb.val[0] = R0 R1 R2 R3 ... - //ta_rgb.val[1] = G0 G1 G2 G3 ... - //ta_rgb.val[2] = B0 B1 B2 B3 ... - - store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], - ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], - out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), - out_u.ptr(), out_v.ptr()); - }, - in, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_rgb_top = load_rgb(in.ptr(), alpha); + const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0], + ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(), + out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_u.ptr(), + out_v.ptr()); + }, + in, out_y, out_u, out_v); } /** Convert RGB to YUV4. 
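The store_rgb_to_nv12/store_rgb_to_iyuv helpers invoked above share one RGB-to-YUV core and differ only in how the planes are written back; the 4:2:0 targets also read two input rows per iteration (ta_rgb_top/ta_rgb_bottom), with the 2x2 chroma averaging presumably done inside the store helpers, much as the YUYV kernels do explicitly with vhaddq_u8. As a scalar sketch of the colour transform these kernels vectorize — the full-range BT.601 (JFIF) coefficients below are an assumption, since the authoritative constants live in helpers outside this diff:

#include <cstdint>

// Hypothetical scalar reference for one pixel; the coefficient choice is an assumption.
inline void rgb_to_yuv_ref(uint8_t r, uint8_t g, uint8_t b, uint8_t &y, uint8_t &u, uint8_t &v)
{
    auto clamp_u8 = [](float x) { return static_cast<uint8_t>(x < 0.f ? 0.f : (x > 255.f ? 255.f : x)); };
    y = clamp_u8(0.299f * r + 0.587f * g + 0.114f * b);          // luma
    u = clamp_u8(-0.169f * r - 0.331f * g + 0.500f * b + 128.f); // Cb, offset to unsigned
    v = clamp_u8(0.500f * r - 0.419f * g - 0.081f * b + 128.f);  // Cr, offset to unsigned
}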
@@ -1030,16 +1102,17 @@ void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict out Iterator out_u(output_ptr->plane(1), win); Iterator out_v(output_ptr->plane(2), win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_rgb = load_rgb(in.ptr(), alpha); - //ta_rgb.val[0] = R0 R1 R2 R3 ... - //ta_rgb.val[1] = G0 G1 G2 G3 ... - //ta_rgb.val[2] = B0 B1 B2 B3 ... - - store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], - out_y.ptr(), out_u.ptr(), out_v.ptr()); - }, - in, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_rgb = load_rgb(in.ptr(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], out_y.ptr(), out_u.ptr(), out_v.ptr()); + }, + in, out_y, out_u, out_v); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h index 96defbc9c9..4b1eb079b2 100644 --- a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h +++ b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h @@ -33,56 +33,32 @@ namespace detail { inline float32x4x3_t load_matrix_row(const float *ptr) { - const float32x4x3_t r = - { - { - vld1q_dup_f32(ptr), - vld1q_dup_f32(1 + ptr), - vld1q_dup_f32(2 + ptr) - } - }; + const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}}; return r; } template <unsigned int stridex> -float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2); +float32x4x2_t convolve_3x3(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2); template <> -inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +inline float32x4x2_t convolve_3x3<1>(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2) { - const float32x4x3_t vtop = - { - { - vld1q_f32(in_top), - vld1q_f32(in_top + 4), - vld1q_f32(in_top + 8) - } - }; - const float32x4x3_t vmid = - { - { - vld1q_f32(in_mid), - vld1q_f32(in_mid + 4), - vld1q_f32(in_mid + 8) - } - }; - const float32x4x3_t vlow = - { - { - vld1q_f32(in_low), - vld1q_f32(in_low + 4), - vld1q_f32(in_low + 8) - } - }; - float32x4x2_t out = - { - { - vmulq_f32(vtop.val[0], m0.val[0]), - vmulq_f32(vtop.val[1], m0.val[0]) - } - }; - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); + const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}}; + const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}}; + const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}}; + float32x4x2_t out = {{vmulq_f32(vtop.val[0], m0.val[0]), vmulq_f32(vtop.val[1], m0.val[0])}}; + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); 
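    // vextq_f32(a, b, n) takes four consecutive lanes from the concatenation of a and b
    // starting at lane n, i.e. the input row shifted by n elements, so each of the
    // nine 3x3 taps becomes a single multiply-accumulate across four output pixels.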
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]); @@ -106,7 +82,12 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +inline float32x4x2_t convolve_3x3<2>(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2) { float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); @@ -116,7 +97,12 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +inline float32x4x2_t convolve_3x3<3>(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2) { float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); @@ -165,6 +151,6 @@ int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteratio { return num_elems_written_per_iteration * 3; } -} +} // namespace detail } // namespace arm_compute -#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */ diff --git a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h index 7ba52a16b7..fd1ee54597 100644 --- a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h +++ b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h @@ -45,14 +45,7 @@ namespace detail inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0) { ARM_COMPUTE_UNUSED(weights_offset); - const float32x4x3_t r = - { - { - vld1q_dup_f32(ptr), - vld1q_dup_f32(1 + ptr), - vld1q_dup_f32(2 + ptr) - } - }; + const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}}; return r; } @@ -63,21 +56,16 @@ inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0) * * @return The loaded matrix. */ -template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) > +template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)> inline int32x4x3_t load_matrix_row(const T *ptr, int weights_offset = 0) { const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset); /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ - int32x4x3_t r = - { - { - vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)), - vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))), - vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2))) - } - }; + int32x4x3_t r = {{vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)), + vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))), + vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))}}; return r; } @@ -245,36 +233,23 @@ inline void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values * @param[in] input_offset (Optional) Input quantization offset. 
* */ -inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - const size_t dilation_x, int input_offset) +inline float32x4_t single_convolve_3x3_dilation(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2, + const size_t dilation_x, + int input_offset) { ARM_COMPUTE_UNUSED(input_offset); - const float32x4x3_t vtop = - { - { - vld1q_f32(in_top), - vld1q_f32(in_top + dilation_x), - vld1q_f32(in_top + 2 * dilation_x) - } - }; - const float32x4x3_t vmid = - { - { - vld1q_f32(in_mid), - vld1q_f32(in_mid + dilation_x), - vld1q_f32(in_mid + 2 * dilation_x) - } - }; - const float32x4x3_t vlow = - { - { - vld1q_f32(in_low), - vld1q_f32(in_low + dilation_x), - vld1q_f32(in_low + 2 * dilation_x) - } - }; + const float32x4x3_t vtop = { + {vld1q_f32(in_top), vld1q_f32(in_top + dilation_x), vld1q_f32(in_top + 2 * dilation_x)}}; + const float32x4x3_t vmid = { + {vld1q_f32(in_mid), vld1q_f32(in_mid + dilation_x), vld1q_f32(in_mid + 2 * dilation_x)}}; + const float32x4x3_t vlow = { + {vld1q_f32(in_low), vld1q_f32(in_low + dilation_x), vld1q_f32(in_low + 2 * dilation_x)}}; float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]); out = vmlaq_f32(out, vtop.val[1], m0.val[1]); out = vmlaq_f32(out, vtop.val[2], m0.val[2]); @@ -303,26 +278,28 @@ inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float * @param[in] input_offset (Optional) Input quantization offset. * */ -inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - const size_t dilation_x, unsigned int stridex, int input_offset = 0) +inline float32x4x2_t convolve_3x3_dilation(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2, + const size_t dilation_x, + unsigned int stridex, + int input_offset = 0) { ARM_COMPUTE_ERROR_ON(stridex > 3); - float32x4x2_t out = - { - { - single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), - single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) - } - }; + float32x4x2_t out = { + {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}}; - if(stridex == 2) + if (stridex == 2) { out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); } @@ -344,26 +321,32 @@ inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_ * */ template <bool accumulate> -void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - unsigned int stridex, int input_offset = 0); +void convolve_3x3(const float *in_top, + const float *in_mid, + const float *in_low, + float *out_ptr, + const float32x4x3_t &m0, + const float32x4x3_t &m1, 
+ const float32x4x3_t &m2, + unsigned int stridex, + int input_offset = 0); template <bool accumulate> -inline void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - unsigned int stridex, int input_offset) +inline void convolve_3x3(const float *in_top, + const float *in_mid, + const float *in_low, + float *out_ptr, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2, + unsigned int stridex, + int input_offset) { ARM_COMPUTE_UNUSED(input_offset); ARM_COMPUTE_ERROR_ON(stridex > 3); - float32x4x2_t out = - { - { - vdupq_n_f32(0.f), - vdupq_n_f32(0.f) - } - }; - if(stridex == 2) + float32x4x2_t out = {{vdupq_n_f32(0.f), vdupq_n_f32(0.f)}}; + if (stridex == 2) { const float32x4x2_t vtop = vld2q_f32(in_top); const float32x4x2_t vmid = vld2q_f32(in_mid); @@ -389,32 +372,11 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float * } else { - const float32x4x3_t vtop = - { - { - vld1q_f32(in_top), - vld1q_f32(in_top + 4), - vld1q_f32(in_top + 8) - } - }; - const float32x4x3_t vmid = - { - { - vld1q_f32(in_mid), - vld1q_f32(in_mid + 4), - vld1q_f32(in_mid + 8) - } - }; - const float32x4x3_t vlow = - { - { - vld1q_f32(in_low), - vld1q_f32(in_low + 4), - vld1q_f32(in_low + 8) - } - }; - out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]); - out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]); + const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}}; + const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}}; + const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}}; + out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]); + out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]); out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); @@ -438,7 +400,7 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float * out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]); out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]); - if(stridex == 3) + if (stridex == 3) { out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out); @@ -462,65 +424,43 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float * * @param[in] input_offset Input quantization offset. 
* */ -template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) > -inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, - const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - size_t dilation_x, int32_t input_offset) +template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)> +inline int32x4_t single_convolve_3x3_dilation(const T *in_top, + const T *in_mid, + const T *in_low, + const int32x4x3_t &m0, + const int32x4x3_t &m1, + const int32x4x3_t &m2, + size_t dilation_x, + int32_t input_offset) { using VectorType = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x3_t, int8x8x3_t>::type; using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>; const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{}); - const VectorType vtop = - { - { - wrapper::vload(in_top), - wrapper::vload(in_top + dilation_x), - wrapper::vload(in_top + 2 * dilation_x) - } - }; - const VectorType vmid = - { - { - wrapper::vload(in_mid), - wrapper::vload(in_mid + dilation_x), - wrapper::vload(in_mid + 2 * dilation_x) - } - }; - const VectorType vlow = - { - { - wrapper::vload(in_low), - wrapper::vload(in_low + dilation_x), - wrapper::vload(in_low + 2 * dilation_x) - } - }; - - const int32x4x3_t vtop_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))), - } - }; - const int32x4x3_t vmid_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))), - } - }; - const int32x4x3_t vlow_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))), - } - }; + const VectorType vtop = { + {wrapper::vload(in_top), wrapper::vload(in_top + dilation_x), wrapper::vload(in_top + 2 * dilation_x)}}; + const VectorType vmid = { + {wrapper::vload(in_mid), wrapper::vload(in_mid + dilation_x), wrapper::vload(in_mid + 2 * dilation_x)}}; + const VectorType vlow = { + {wrapper::vload(in_low), wrapper::vload(in_low + dilation_x), wrapper::vload(in_low + 2 * dilation_x)}}; + + const int32x4x3_t vtop_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))), + }}; + const int32x4x3_t vmid_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), + wrapper::vaddw(v_input_offset, 
wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))), + }}; + const int32x4x3_t vlow_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))), + }}; int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]); out = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]); @@ -550,26 +490,29 @@ inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, * @param[in] input_offset Input quantization offset. * */ -template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) > -inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - const size_t dilation_x, unsigned int stridex, int input_offset) +template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)> +inline int32x4x2_t convolve_3x3_dilation(const T *in_top, + const T *in_mid, + const T *in_low, + const int32x4x3_t &m0, + const int32x4x3_t &m1, + const int32x4x3_t &m2, + const size_t dilation_x, + unsigned int stridex, + int input_offset) { ARM_COMPUTE_ERROR_ON(stridex > 3); - int32x4x2_t out = - { - { - single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), - single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) - } - }; + int32x4x2_t out = { + {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}}; - if(stridex == 2) + if (stridex == 2) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1); out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2); out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1); } @@ -589,10 +532,19 @@ inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const * @param[in] input_offset Input quantization offset. 
* */ -template < bool accumulate, typename T1, typename T2, ARM_COMPUTE_REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value) > -void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr, - const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - unsigned int stridex, int32_t input_offset) +template <bool accumulate, + typename T1, + typename T2, + ARM_COMPUTE_REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value)> +void convolve_3x3(const T1 *in_top, + const T1 *in_mid, + const T1 *in_low, + T2 *out_ptr, + const int32x4x3_t &m0, + const int32x4x3_t &m1, + const int32x4x3_t &m2, + unsigned int stridex, + int32_t input_offset) { ARM_COMPUTE_ERROR_ON(stridex > 3); using VectorType = typename std::conditional<std::is_same<T1, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type; @@ -600,60 +552,30 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{}); - const VectorType vtop = - { - { - wrapper::vload(in_top), - wrapper::vload(in_top + 8) - } - }; - const VectorType vmid = - { - { - wrapper::vload(in_mid), - wrapper::vload(in_mid + 8) - } - }; - const VectorType vlow = - { - { - wrapper::vload(in_low), - wrapper::vload(in_low + 8) - } - }; - - const int32x4x3_t vtop_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), - } - }; - const int32x4x3_t vmid_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), - } - }; - const int32x4x3_t vlow_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), - } - }; - - int32x4x2_t out - { - { - wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}), - wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}), - } - }; + const VectorType vtop = {{wrapper::vload(in_top), wrapper::vload(in_top + 8)}}; + const VectorType vmid = {{wrapper::vload(in_mid), wrapper::vload(in_mid + 8)}}; + const VectorType vlow = {{wrapper::vload(in_low), wrapper::vload(in_low + 8)}}; + + const int32x4x3_t vtop_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), + }}; + const int32x4x3_t vmid_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), + }}; + 
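    // Same widening pattern per row: vmovl widens the 8-bit lanes to 16 bits,
    // vgetlow/vgethigh pick a half, and vaddw adds the 32-bit input quantization
    // offset while widening again, so the multiply-accumulates below all run at
    // 32-bit precision.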
const int32x4x3_t vlow_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), + }}; + + int32x4x2_t out{{ + wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}), + wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}), + }}; // 0 out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]); @@ -681,11 +603,11 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]); out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]); - if(stridex == 1) + if (stridex == 1) { accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out); } - else if(stridex == 2) + else if (stridex == 2) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1); out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2); @@ -693,7 +615,7 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1); accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out); @@ -712,14 +634,7 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = ARM_COMPUTE_UNUSED(weights_offset); /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ - const float16x8x3_t r = - { - { - vld1q_dup_f16(ptr), - vld1q_dup_f16(1 + ptr), - vld1q_dup_f16(2 + ptr) - } - }; + const float16x8x3_t r = {{vld1q_dup_f16(ptr), vld1q_dup_f16(1 + ptr), vld1q_dup_f16(2 + ptr)}}; return r; } @@ -735,35 +650,22 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = * @param[in] input_offset (Optional)Input quantization offset. 
* */ -inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, - const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - const size_t dilation_x, int input_offset = 0) +inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, + const float16_t *in_mid, + const float16_t *in_low, + const float16x8x3_t &m0, + const float16x8x3_t &m1, + const float16x8x3_t &m2, + const size_t dilation_x, + int input_offset = 0) { ARM_COMPUTE_UNUSED(input_offset); - const float16x8x3_t vtop = - { - { - vld1q_f16(in_top), - vld1q_f16(in_top + dilation_x), - vld1q_f16(in_top + 2 * dilation_x) - } - }; - const float16x8x3_t vmid = - { - { - vld1q_f16(in_mid), - vld1q_f16(in_mid + dilation_x), - vld1q_f16(in_mid + 2 * dilation_x) - } - }; - const float16x8x3_t vlow = - { - { - vld1q_f16(in_low), - vld1q_f16(in_low + dilation_x), - vld1q_f16(in_low + 2 * dilation_x) - } - }; + const float16x8x3_t vtop = { + {vld1q_f16(in_top), vld1q_f16(in_top + dilation_x), vld1q_f16(in_top + 2 * dilation_x)}}; + const float16x8x3_t vmid = { + {vld1q_f16(in_mid), vld1q_f16(in_mid + dilation_x), vld1q_f16(in_mid + 2 * dilation_x)}}; + const float16x8x3_t vlow = { + {vld1q_f16(in_low), vld1q_f16(in_low + dilation_x), vld1q_f16(in_low + 2 * dilation_x)}}; float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]); out = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1])); out = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2])); @@ -792,19 +694,21 @@ inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const f * @param[in] input_offset (Optional) Input quantization offset. * */ -inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, - const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - const size_t dilation_x, unsigned int stridex, int input_offset = 0) -{ - float16x8x2_t out = - { - { - single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), - single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset) - } - }; - - if(stridex == 2) +inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, + const float16_t *in_mid, + const float16_t *in_low, + const float16x8x3_t &m0, + const float16x8x3_t &m1, + const float16x8x3_t &m2, + const size_t dilation_x, + unsigned int stridex, + int input_offset = 0) +{ + float16x8x2_t out = { + {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)}}; + + if (stridex == 2) { out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2); @@ -814,7 +718,7 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1 out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2); @@ -838,20 +742,20 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1 * */ template <bool accumulate> -inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, 
const float16_t *in_low, float16_t *out_ptr, - const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - unsigned int stridex, int input_offset = 0) +inline void convolve_3x3(const float16_t *in_top, + const float16_t *in_mid, + const float16_t *in_low, + float16_t *out_ptr, + const float16x8x3_t &m0, + const float16x8x3_t &m1, + const float16x8x3_t &m2, + unsigned int stridex, + int input_offset = 0) { ARM_COMPUTE_UNUSED(input_offset); - float16x8x2_t out = - { - { - vdupq_n_f16(0), - vdupq_n_f16(0) - } - }; - if(stridex == 2) + float16x8x2_t out = {{vdupq_n_f16(0), vdupq_n_f16(0)}}; + if (stridex == 2) { const float16x8x2_t vtop = vld2q_f16(in_top); const float16x8x2_t vmid = vld2q_f16(in_mid); @@ -877,32 +781,11 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const } else { - const float16x8x3_t vtop = - { - { - vld1q_f16(in_top), - vld1q_f16(in_top + 8), - vld1q_f16(in_top + 16) - } - }; - const float16x8x3_t vmid = - { - { - vld1q_f16(in_mid), - vld1q_f16(in_mid + 8), - vld1q_f16(in_mid + 16) - } - }; - const float16x8x3_t vlow = - { - { - vld1q_f16(in_low), - vld1q_f16(in_low + 8), - vld1q_f16(in_low + 16) - } - }; - out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]); - out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]); + const float16x8x3_t vtop = {{vld1q_f16(in_top), vld1q_f16(in_top + 8), vld1q_f16(in_top + 16)}}; + const float16x8x3_t vmid = {{vld1q_f16(in_mid), vld1q_f16(in_mid + 8), vld1q_f16(in_mid + 16)}}; + const float16x8x3_t vlow = {{vld1q_f16(in_low), vld1q_f16(in_low + 8), vld1q_f16(in_low + 16)}}; + out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]); + out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]); out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1])); out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2])); @@ -921,7 +804,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1])); out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2])); - if(stridex == 3) + if (stridex == 3) { out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2); @@ -946,7 +829,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const */ inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex) { - switch(stridex) + switch (stridex) { case 1: return num_elems_written_per_iteration; @@ -959,6 +842,6 @@ inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iter return 0; } } -} +} // namespace detail } // namespace arm_compute #endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H */ diff --git a/src/core/NEON/wrapper/intrinsics/cvt.h b/src/core/NEON/wrapper/intrinsics/cvt.h index 1c77a9e9f0..381de2284a 100644 --- a/src/core/NEON/wrapper/intrinsics/cvt.h +++ b/src/core/NEON/wrapper/intrinsics/cvt.h @@ -30,12 +30,11 @@ namespace arm_compute { namespace wrapper { -#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ - template <typename T> \ - inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type \ - vcvt(const vtype &a) \ - { \ - return prefix##_##postfix1##_##postfix2(a); \ +#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ + template <typename T> \ + inline typename 
std::enable_if<std::is_same<T, float>::value, float32x4_t>::type vcvt(const vtype &a) \ + { \ + return prefix##_##postfix1##_##postfix2(a); \ } VCVT_TO_F32_IMPL(float32x4_t, uint32x4_t, vcvtq, f32, u32) @@ -46,12 +45,11 @@ VCVT_TO_F32_IMPL(float32x4_t, float16x4_t, vcvt, f32, f16) #undef VCVT_TO_F32_IMPL #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ - template <typename T> \ - inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type \ - vcvt(const vtype &a) \ - { \ - return prefix##_##postfix1##_##postfix2(a); \ +#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ + template <typename T> \ + inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type vcvt(const vtype &a) \ + { \ + return prefix##_##postfix1##_##postfix2(a); \ } VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32) @@ -59,14 +57,14 @@ VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <typename T> -inline typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, uint32_t>::value, uint32x4_t >::type +inline typename std::enable_if<std::is_same<T, uint8_t>::value || std::is_same<T, uint32_t>::value, uint32x4_t>::type vcvt(const float32x4_t &a) { return vcvtq_u32_f32(a); } template <typename T> -inline typename std::enable_if < std::is_same<T, int8_t>::value || std::is_same<T, int32_t>::value, int32x4_t >::type +inline typename std::enable_if<std::is_same<T, int8_t>::value || std::is_same<T, int32_t>::value, int32x4_t>::type vcvt(const float32x4_t &a) { return vcvtq_s32_f32(a); @@ -74,15 +72,13 @@ vcvt(const float32x4_t &a) #ifdef __aarch64__ template <typename T> -inline typename std::enable_if<std::is_same<T, uint32_t>::value, uint32x4_t>::type -vcvta(const float32x4_t &a) +inline typename std::enable_if<std::is_same<T, uint32_t>::value, uint32x4_t>::type vcvta(const float32x4_t &a) { return vcvtaq_u32_f32(a); } template <typename T> -inline typename std::enable_if<std::is_same<T, int32_t>::value, int32x4_t>::type -vcvta(const float32x4_t &a) +inline typename std::enable_if<std::is_same<T, int32_t>::value, int32x4_t>::type vcvta(const float32x4_t &a) { return vcvtaq_s32_f32(a); } @@ -96,14 +92,13 @@ vcvta(const float32x4_t &a) */ inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr) { - __asm __volatile( - "ldp q0, q1, [%[inptr]]\n" - ".inst 0xea16800\n" // BFCVTN v0, v0 - ".inst 0x4ea16820\n" // BFCVTN2 v0, v1 - "str q0, [%[outptr]]\n" - : [inptr] "+r"(inptr) - : [outptr] "r"(outptr) - : "v0", "v1", "memory"); + __asm __volatile("ldp q0, q1, [%[inptr]]\n" + ".inst 0xea16800\n" // BFCVTN v0, v0 + ".inst 0x4ea16820\n" // BFCVTN2 v0, v1 + "str q0, [%[outptr]]\n" + : [inptr] "+r"(inptr) + : [outptr] "r"(outptr) + : "v0", "v1", "memory"); } #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ diff --git a/src/core/NEON/wrapper/intrinsics/div.h b/src/core/NEON/wrapper/intrinsics/div.h index 265f30d33b..ece991a5b0 100644 --- a/src/core/NEON/wrapper/intrinsics/div.h +++ b/src/core/NEON/wrapper/intrinsics/div.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_DIV_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/erf.h b/src/core/NEON/wrapper/intrinsics/erf.h index e2207648e5..0e34462b96 100644 --- a/src/core/NEON/wrapper/intrinsics/erf.h +++ b/src/core/NEON/wrapper/intrinsics/erf.h @@ -26,6 +26,7 @@ #define ARM_COMPUTE_WRAPPER_ERF_H 
#include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/exp.h b/src/core/NEON/wrapper/intrinsics/exp.h index c2a6970967..f44577b926 100644 --- a/src/core/NEON/wrapper/intrinsics/exp.h +++ b/src/core/NEON/wrapper/intrinsics/exp.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_EXP_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/getlane.h b/src/core/NEON/wrapper/intrinsics/getlane.h index 2052751612..ae813bb2fa 100644 --- a/src/core/NEON/wrapper/intrinsics/getlane.h +++ b/src/core/NEON/wrapper/intrinsics/getlane.h @@ -33,7 +33,7 @@ namespace wrapper #define VGETLANE_IMPL_8(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vget_lane_##postfix(vector, 0); \ @@ -59,7 +59,7 @@ namespace wrapper #define VGETLANE_IMPL_4(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vget_lane_##postfix(vector, 0); \ @@ -77,7 +77,7 @@ namespace wrapper #define VGETLANE_IMPL_2(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vget_lane_##postfix(vector, 0); \ @@ -102,7 +102,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_16(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ @@ -144,7 +144,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_8(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ @@ -170,7 +170,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_4(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ @@ -188,7 +188,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_2(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ diff --git a/src/core/NEON/wrapper/intrinsics/inv.h b/src/core/NEON/wrapper/intrinsics/inv.h index de398b0403..e443be679b 100644 --- a/src/core/NEON/wrapper/intrinsics/inv.h +++ b/src/core/NEON/wrapper/intrinsics/inv.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_INV_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/invsqrt.h b/src/core/NEON/wrapper/intrinsics/invsqrt.h index 2343efa8f8..257b445cc7 100644 --- a/src/core/NEON/wrapper/intrinsics/invsqrt.h +++ b/src/core/NEON/wrapper/intrinsics/invsqrt.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_INVSQRT_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/log.h b/src/core/NEON/wrapper/intrinsics/log.h index 357a77ca78..d091407edb 100644 --- a/src/core/NEON/wrapper/intrinsics/log.h +++ b/src/core/NEON/wrapper/intrinsics/log.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_LOG_H #include "src/core/NEON/NEMath.h" 
+ #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/pow.h b/src/core/NEON/wrapper/intrinsics/pow.h index 61f834ed23..dfd6ccc358 100644 --- a/src/core/NEON/wrapper/intrinsics/pow.h +++ b/src/core/NEON/wrapper/intrinsics/pow.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_POW_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/qmov.h b/src/core/NEON/wrapper/intrinsics/qmov.h index 167f3cf43b..9a0a23a241 100644 --- a/src/core/NEON/wrapper/intrinsics/qmov.h +++ b/src/core/NEON/wrapper/intrinsics/qmov.h @@ -31,15 +31,13 @@ namespace arm_compute namespace wrapper { template <typename T> -inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type -vqmov(const int16x8_t &a) +inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type vqmov(const int16x8_t &a) { return vqmovun_s16(a); } template <typename T> -inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type -vqmov(const int16x8_t &a) +inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type vqmov(const int16x8_t &a) { return vqmovn_s16(a); } diff --git a/src/core/NEON/wrapper/intrinsics/reinterpret.h b/src/core/NEON/wrapper/intrinsics/reinterpret.h index cf00a4aceb..c2c4f720d2 100644 --- a/src/core/NEON/wrapper/intrinsics/reinterpret.h +++ b/src/core/NEON/wrapper/intrinsics/reinterpret.h @@ -35,7 +35,7 @@ namespace wrapper { \ return prefix##_##postfix1##_##postfix2(a); \ } \ - \ + \ inline ptype vreinterpret(const ptype &a) \ { \ return a; \ diff --git a/src/core/NEON/wrapper/intrinsics/round.h b/src/core/NEON/wrapper/intrinsics/round.h index d23feb6b42..7789aab770 100644 --- a/src/core/NEON/wrapper/intrinsics/round.h +++ b/src/core/NEON/wrapper/intrinsics/round.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_ROUND_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/setlane.h b/src/core/NEON/wrapper/intrinsics/setlane.h index 197eedacb5..259b8eaf90 100644 --- a/src/core/NEON/wrapper/intrinsics/setlane.h +++ b/src/core/NEON/wrapper/intrinsics/setlane.h @@ -33,7 +33,7 @@ namespace wrapper #define VSETLANE_IMPL_8(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vset_lane_##postfix(value, vector, 0); \ @@ -59,7 +59,7 @@ namespace wrapper #define VSETLANE_IMPL_4(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vset_lane_##postfix(value, vector, 0); \ @@ -77,7 +77,7 @@ namespace wrapper #define VSETLANE_IMPL_2(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vset_lane_##postfix(value, vector, 0); \ @@ -102,7 +102,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16) #define VSETQLANE_IMPL_16(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vsetq_lane_##postfix(value, vector, 0); \ @@ -144,7 +144,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16) #define VSETQLANE_IMPL_8(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype 
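The enable_if overloads in qmov.h above pick the saturating-narrow intrinsic from the requested destination element type. A self-contained sketch of the same dispatch, assuming a C++11 NEON toolchain and using a hypothetical narrow_sat name:

#include <arm_neon.h>
#include <cstdint>
#include <type_traits>

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type narrow_sat(const int16x8_t &a)
{
    return vqmovun_s16(a); // saturates each signed 16-bit lane into [0, 255]
}

template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type narrow_sat(const int16x8_t &a)
{
    return vqmovn_s16(a); // saturates each signed 16-bit lane into [-128, 127]
}

// Usage: uint8x8_t u = narrow_sat<uint8_t>(v);  int8x8_t s = narrow_sat<int8_t>(v);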
value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vsetq_lane_##postfix(value, vector, 0); \ @@ -170,7 +170,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16) #define VSETQLANE_IMPL_4(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vsetq_lane_##postfix(value, vector, 0); \ diff --git a/src/core/NEON/wrapper/intrinsics/shr.h b/src/core/NEON/wrapper/intrinsics/shr.h index 73ca9c56c6..6ccb9cdf92 100644 --- a/src/core/NEON/wrapper/intrinsics/shr.h +++ b/src/core/NEON/wrapper/intrinsics/shr.h @@ -75,7 +75,7 @@ VQRSHRN_SCALAR_IMPL(uint32_t, uint64_t, vqrshrnd_n, u64) { \ return prefix_signed##_##postfix(a, b); \ } \ - \ + \ template <int b, typename T> \ inline typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, u##half_vtype>::type \ vqrshrn_ex(const vtype &a) \ @@ -128,7 +128,7 @@ VSHRQ_SCALAR_IMPL(int32_t, vshrd_n, s64) { \ return prefix_signed##_##postfix(a, b); \ } \ - \ + \ template <int b, typename T> \ inline typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, u##half_vtype>::type \ vqrshrn_ex(const vtype &a) \ diff --git a/src/core/NEON/wrapper/intrinsics/sin.h b/src/core/NEON/wrapper/intrinsics/sin.h index 03c2813a32..d24fdfa816 100644 --- a/src/core/NEON/wrapper/intrinsics/sin.h +++ b/src/core/NEON/wrapper/intrinsics/sin.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_SIN_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute @@ -54,4 +55,4 @@ VSIN_IMPL_INT(int32x4_t, vsinq, s32) #undef vsub_IMPL } // namespace wrapper } // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_SUB_H */
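vqrshrn_ex in shr.h applies the same enable_if selection to the saturating rounding right shifts with narrowing: signed destination types take vqrshrn_n_*, unsigned ones the vqrshrun_n_* variant. A reduced sketch for the 16-to-8-bit case (hypothetical name; the shift amount is a template constant because the intrinsics need an immediate):

#include <arm_neon.h>
#include <cstdint>
#include <type_traits>

template <int b, typename T>
inline typename std::enable_if<std::is_signed<T>::value, int8x8_t>::type rshr_narrow_sat(const int16x8_t &a)
{
    return vqrshrn_n_s16(a, b); // round, shift right by b, saturate to int8
}

template <int b, typename T>
inline typename std::enable_if<!std::is_signed<T>::value, uint8x8_t>::type rshr_narrow_sat(const int16x8_t &a)
{
    return vqrshrun_n_s16(a, b); // round, shift right by b, saturate to uint8
}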
\ No newline at end of file +#endif /* ARM_COMPUTE_WRAPPER_SUB_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svcnt.h b/src/core/NEON/wrapper/intrinsics/svcnt.h index e530e7c83f..c4652504b4 100644 --- a/src/core/NEON/wrapper/intrinsics/svcnt.h +++ b/src/core/NEON/wrapper/intrinsics/svcnt.h @@ -30,7 +30,7 @@ namespace arm_compute namespace wrapper { template <size_t element_size> -inline uint64_t svcnt_size(); +inline uint64_t svcnt_size(); template <> inline uint64_t svcnt_size<64>() @@ -65,4 +65,4 @@ inline uint64_t svcnt() } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */
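svcnt_size in svcnt.h maps an element width onto the matching SVE lane-count intrinsic, so vector-length-agnostic kernels can ask how many lanes one register holds. A stand-alone sketch, assuming a toolchain with __ARM_FEATURE_SVE:

#include <arm_sve.h>
#include <cstddef>
#include <cstdint>

template <size_t element_size>
inline uint64_t lanes();

template <>
inline uint64_t lanes<64>() { return svcntd(); } // 64-bit lanes per vector
template <>
inline uint64_t lanes<32>() { return svcntw(); } // 32-bit lanes
template <>
inline uint64_t lanes<16>() { return svcnth(); } // 16-bit lanes
template <>
inline uint64_t lanes<8>() { return svcntb(); } // 8-bit lanes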
\ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svcvt.h b/src/core/NEON/wrapper/intrinsics/svcvt.h index 746b004d7d..00ef7b7eb3 100644 --- a/src/core/NEON/wrapper/intrinsics/svcvt.h +++ b/src/core/NEON/wrapper/intrinsics/svcvt.h @@ -29,11 +29,12 @@ namespace arm_compute { namespace wrapper { -#define SVCVT_Z_TO_F32_IMPL(vtype) \ - template <typename T> \ - inline typename std::enable_if<std::is_same<T, float>::value, svfloat32_t>::type svcvt_z(svbool_t pg, const vtype &a) \ - { \ - return svcvt_f32_z(pg, a); \ +#define SVCVT_Z_TO_F32_IMPL(vtype) \ + template <typename T> \ + inline typename std::enable_if<std::is_same<T, float>::value, svfloat32_t>::type svcvt_z(svbool_t pg, \ + const vtype &a) \ + { \ + return svcvt_f32_z(pg, a); \ } SVCVT_Z_TO_F32_IMPL(svuint32_t) @@ -42,11 +43,12 @@ SVCVT_Z_TO_F32_IMPL(svfloat16_t) #undef SVCVT_Z_TO_F32_IMPL -#define SVCVT_Z_TO_F16_IMPL(vtype) \ - template <typename T> \ - inline typename std::enable_if<std::is_same<T, float16_t>::value, svfloat16_t>::type svcvt_z(svbool_t pg, const vtype &a) \ - { \ - return svcvt_f16_z(pg, a); \ +#define SVCVT_Z_TO_F16_IMPL(vtype) \ + template <typename T> \ + inline typename std::enable_if<std::is_same<T, float16_t>::value, svfloat16_t>::type svcvt_z(svbool_t pg, \ + const vtype &a) \ + { \ + return svcvt_f16_z(pg, a); \ } SVCVT_Z_TO_F16_IMPL(svuint32_t) @@ -55,11 +57,12 @@ SVCVT_Z_TO_F16_IMPL(svfloat32_t) #undef SVCVT_Z_TO_F16_IMPL -#define SVCVT_Z_TO_S32_IMPL(vtype) \ - template <typename T> \ - inline typename std::enable_if<std::is_same<T, int32_t>::value, svint32_t>::type svcvt_z(svbool_t pg, const vtype &a) \ - { \ - return svcvt_s32_z(pg, a); \ +#define SVCVT_Z_TO_S32_IMPL(vtype) \ + template <typename T> \ + inline typename std::enable_if<std::is_same<T, int32_t>::value, svint32_t>::type svcvt_z(svbool_t pg, \ + const vtype &a) \ + { \ + return svcvt_s32_z(pg, a); \ } SVCVT_Z_TO_S32_IMPL(svfloat16_t) @@ -71,4 +74,4 @@ SVCVT_Z_TO_S32_IMPL(svfloat32_t) } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */
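The SVCVT_Z_TO_* macros above select the destination of an SVE conversion through the template parameter and forward to the overloaded ACLE intrinsic; the _z suffix means lanes where the governing predicate is false come back as zero. A minimal usage sketch (assumed SVE toolchain):

#include <arm_sve.h>

// Convert int32 lanes to float32, zeroing the inactive lanes.
inline svfloat32_t to_f32(svbool_t pg, svint32_t v)
{
    return svcvt_f32_z(pg, v);
}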
\ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svexp.h b/src/core/NEON/wrapper/intrinsics/svexp.h index d6ce9a77d1..1e8bce3960 100644 --- a/src/core/NEON/wrapper/intrinsics/svexp.h +++ b/src/core/NEON/wrapper/intrinsics/svexp.h @@ -26,6 +26,7 @@ #if defined(__ARM_FEATURE_SVE) #include "src/core/NEON/SVEMath.h" + #include <arm_sve.h> namespace arm_compute @@ -46,4 +47,4 @@ SVEXP_IMPL(svfloat16_t, f16) } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */
\ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svlog.h b/src/core/NEON/wrapper/intrinsics/svlog.h index 5b505ae1e3..b4630e20ed 100644 --- a/src/core/NEON/wrapper/intrinsics/svlog.h +++ b/src/core/NEON/wrapper/intrinsics/svlog.h @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H #if defined(__ARM_FEATURE_SVE) #include "src/core/NEON/SVEMath.h" + #include <arm_sve.h> namespace arm_compute @@ -44,4 +45,4 @@ SVLOG_IMPL(svfloat16_t, f16) } // namespace wrapper } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */
\ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svptrue.h b/src/core/NEON/wrapper/intrinsics/svptrue.h index 53407e5301..6ed00bccbf 100644 --- a/src/core/NEON/wrapper/intrinsics/svptrue.h +++ b/src/core/NEON/wrapper/intrinsics/svptrue.h @@ -30,7 +30,7 @@ namespace arm_compute namespace wrapper { template <size_t element_size> -inline svbool_t svptrue_size(); +inline svbool_t svptrue_size(); template <> inline svbool_t svptrue_size<64>() @@ -65,4 +65,4 @@ svbool_t svptrue() } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */
\ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svwhilelt.h b/src/core/NEON/wrapper/intrinsics/svwhilelt.h index ef58217dc4..f0f84a9508 100644 --- a/src/core/NEON/wrapper/intrinsics/svwhilelt.h +++ b/src/core/NEON/wrapper/intrinsics/svwhilelt.h @@ -32,7 +32,7 @@ namespace wrapper #define SVWHILELT_IMPL(type) \ template <size_t element_size> \ inline svbool_t svwhilelt_size(type a, type b); \ - \ + \ template <> \ inline svbool_t svwhilelt_size<64>(type a, type b) \ { \ @@ -70,4 +70,4 @@ inline svbool_t svwhilelt(IndexType a, IndexType b) } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */
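svwhilelt, whose implementation macro is reformatted above, is what makes SVE loops vector-length agnostic: it builds a predicate that is true exactly for the indices still inside the bound, so the tail iteration needs no scalar epilogue. A typical strip-mined loop using it, sketched under the assumption of an SVE toolchain:

#include <arm_sve.h>

// y[i] += x[i] for n floats, without a scalar tail loop.
void add_inplace(float *y, const float *x, int n)
{
    for (int i = 0; i < n; i += static_cast<int>(svcntw()))
    {
        const svbool_t pg = svwhilelt_b32(i, n);       // lanes where i + lane < n
        const svfloat32_t vx = svld1_f32(pg, x + i);   // inactive lanes load as zero
        const svfloat32_t vy = svld1_f32(pg, y + i);
        svst1_f32(pg, y + i, svadd_f32_z(pg, vx, vy)); // inactive lanes are not stored
    }
}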
\ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */ diff --git a/src/core/NEON/wrapper/intrinsics/tanh.h b/src/core/NEON/wrapper/intrinsics/tanh.h index daeaf19997..e74f0e86fe 100644 --- a/src/core/NEON/wrapper/intrinsics/tanh.h +++ b/src/core/NEON/wrapper/intrinsics/tanh.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_TANH_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/scalar/add.h b/src/core/NEON/wrapper/scalar/add.h index 642d9261f3..2ec88869e3 100644 --- a/src/core/NEON/wrapper/scalar/add.h +++ b/src/core/NEON/wrapper/scalar/add.h @@ -32,22 +32,22 @@ namespace wrapper { inline uint8_t add_sat(const uint8_t &a, const uint8_t &b) { - const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 }; - const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 }; + const uint8x8_t va = {a, 0, 0, 0, 0, 0, 0, 0}; + const uint8x8_t vb = {b, 0, 0, 0, 0, 0, 0, 0}; return vget_lane_u8(vqadd_u8(va, vb), 0); } inline int16_t add_sat(const int16_t &a, const int16_t &b) { - const int16x4_t va = { a, 0, 0, 0 }; - const int16x4_t vb = { b, 0, 0, 0 }; + const int16x4_t va = {a, 0, 0, 0}; + const int16x4_t vb = {b, 0, 0, 0}; return vget_lane_s16(vqadd_s16(va, vb), 0); } inline int32_t add_sat(const int32_t &a, const int32_t &b) { - const int32x2_t va = { a, 0 }; - const int32x2_t vb = { b, 0 }; + const int32x2_t va = {a, 0}; + const int32x2_t vb = {b, 0}; return vget_lane_s32(vqadd_s32(va, vb), 0); } diff --git a/src/core/NEON/wrapper/scalar/sub.h b/src/core/NEON/wrapper/scalar/sub.h index 1fe51d75fc..00de7d867f 100644 --- a/src/core/NEON/wrapper/scalar/sub.h +++ b/src/core/NEON/wrapper/scalar/sub.h @@ -32,22 +32,22 @@ namespace wrapper { inline uint8_t sub_sat(const uint8_t &a, const uint8_t &b) { - const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 }; - const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 }; + const uint8x8_t va = {a, 0, 0, 0, 0, 0, 0, 0}; + const uint8x8_t vb = {b, 0, 0, 0, 0, 0, 0, 0}; return vget_lane_u8(vqsub_u8(va, vb), 0); } inline int16_t sub_sat(const int16_t &a, const int16_t &b) { - const int16x4_t va = { a, 0, 0, 0 }; - const int16x4_t vb = { b, 0, 0, 0 }; + const int16x4_t va = {a, 0, 0, 0}; + const int16x4_t vb = {b, 0, 0, 0}; return vget_lane_s16(vqsub_s16(va, vb), 0); } inline int32_t sub_sat(const int32_t &a, const int32_t &b) { - const int32x2_t va = { a, 0 }; - const int32x2_t vb = { b, 0 }; + const int32x2_t va = {a, 0}; + const int32x2_t vb = {b, 0}; return vget_lane_s32(vqsub_s32(va, vb), 0); } diff --git a/src/core/NEON/wrapper/svtraits.h b/src/core/NEON/wrapper/svtraits.h index 5ccd0ba8f1..330d272752 100644 --- a/src/core/NEON/wrapper/svtraits.h +++ b/src/core/NEON/wrapper/svtraits.h @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_WRAPPER_SVTRAITS_H #if defined(ARM_COMPUTE_ENABLE_SVE) #include "src/core/NEON/SVEMath.h" + #include <arm_sve.h> namespace arm_compute diff --git a/src/core/Rounding.cpp b/src/core/Rounding.cpp index 99858e2a98..62ce335815 100644 --- a/src/core/Rounding.cpp +++ b/src/core/Rounding.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Rounding.h" #include "arm_compute/core/Error.h" + #include "support/ToolchainSupport.h" #include <cmath> @@ -36,7 +37,7 @@ int arm_compute::round(float x, RoundingPolicy rounding_policy) { using namespace std; int rounded = 0; - switch(rounding_policy) + switch (rounding_policy) { case RoundingPolicy::TO_ZERO: { @@ -51,9 +52,7 @@ int arm_compute::round(float x, RoundingPolicy rounding_policy) case RoundingPolicy::TO_NEAREST_EVEN: { #ifdef __aarch64__ - 
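The scalar add_sat/sub_sat helpers above obtain saturating scalar arithmetic by parking the operands in lane 0 of a NEON vector, running the saturating vector op, and reading lane 0 back, which avoids branching on overflow. A worked check for the int16_t case:

#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main()
{
    const int16x4_t va = {INT16_MAX, 0, 0, 0};
    const int16x4_t vb = {1, 0, 0, 0};
    // Saturates instead of wrapping: prints 32767, not -32768.
    std::printf("%d\n", vget_lane_s16(vqadd_s16(va, vb), 0));
    return 0;
}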
asm("fcvtns %x[res], %s[value]" - : [res] "=r"(rounded) - : [value] "w"(x)); + asm("fcvtns %x[res], %s[value]" : [res] "=r"(rounded) : [value] "w"(x)); #else // __aarch64__ ARM_COMPUTE_ERROR("TO_NEAREST_EVEN rounding policy is not supported."); #endif // __aarch64__ diff --git a/src/core/Size2D.cpp b/src/core/Size2D.cpp index 6eb46e56af..69b2651520 100644 --- a/src/core/Size2D.cpp +++ b/src/core/Size2D.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/core/Size2D.h" + #include "support/StringSupport.h" namespace arm_compute @@ -30,4 +31,4 @@ std::string Size2D::to_string() const { return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height); } -} +} // namespace arm_compute diff --git a/src/core/Size3D.cpp b/src/core/Size3D.cpp index 3ee9fb8e5c..b56a99acd7 100644 --- a/src/core/Size3D.cpp +++ b/src/core/Size3D.cpp @@ -22,12 +22,14 @@ * SOFTWARE. */ #include "arm_compute/core/Size3D.h" + #include "support/StringSupport.h" namespace arm_compute { std::string Size3D::to_string() const { - return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height) + std::string("x") + support::cpp11::to_string(depth); + return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height) + std::string("x") + + support::cpp11::to_string(depth); } -}
\ No newline at end of file +} // namespace arm_compute diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp index 723b6bc016..8012c3d721 100644 --- a/src/core/SubTensorInfo.cpp +++ b/src/core/SubTensorInfo.cpp @@ -42,10 +42,10 @@ namespace TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coordinates coords) { // Extend shape - for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i) + for (unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i) { int dimension_extend = coords[i] + static_cast<int>(shape[i]); - if((dimension_extend > static_cast<int>(parent_shape[i])) && (dimension_extend > 0)) + if ((dimension_extend > static_cast<int>(parent_shape[i])) && (dimension_extend > 0)) { parent_shape.set(i, static_cast<size_t>(dimension_extend)); } @@ -56,23 +56,35 @@ TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coo } // namespace SubTensorInfo::SubTensorInfo() - : _parent(nullptr), _tensor_shape(), _dims_state(), _coords(), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(false), _lock_paddings(false) + : _parent(nullptr), + _tensor_shape(), + _dims_state(), + _coords(), + _valid_region{Coordinates(), _tensor_shape}, + _extend_parent(false), + _lock_paddings(false) { } SubTensorInfo::SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords, bool extend_parent) - : _parent(parent), _tensor_shape(tensor_shape), _dims_state(), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(extend_parent), _lock_paddings(false) + : _parent(parent), + _tensor_shape(tensor_shape), + _dims_state(), + _coords(coords), + _valid_region{Coordinates(), _tensor_shape}, + _extend_parent(extend_parent), + _lock_paddings(false) { ARM_COMPUTE_ERROR_ON(parent == nullptr); // Check if subtensor is valid if parent is configured - if(parent->tensor_shape().total_size() != 0 && !_extend_parent) + if (parent->tensor_shape().total_size() != 0 && !_extend_parent) { ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(parent->tensor_shape(), coords, tensor_shape); } // Initialize valid region - _valid_region = ValidRegion{ Coordinates(), _tensor_shape }; + _valid_region = ValidRegion{Coordinates(), _tensor_shape}; } std::unique_ptr<ITensorInfo> SubTensorInfo::clone() const @@ -91,17 +103,17 @@ ITensorInfo &SubTensorInfo::set_tensor_shape(const TensorShape &shape) ARM_COMPUTE_ERROR_ON(_parent == nullptr); // Check if subtensor is valid if parent is configured - if(_parent->tensor_shape().total_size() != 0 && !_extend_parent) + if (_parent->tensor_shape().total_size() != 0 && !_extend_parent) { ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape); - _valid_region = ValidRegion{ _coords, shape }; + _valid_region = ValidRegion{_coords, shape}; } - else if(_extend_parent) // Extend parent shape, configure if specified + else if (_extend_parent) // Extend parent shape, configure if specified { ARM_COMPUTE_ERROR_ON((_parent->data_type() == DataType::UNKNOWN) && (_parent->format() == Format::UNKNOWN)); TensorShape parent_extended_shape = extend_parent_shape(_parent->tensor_shape(), shape, _coords); _parent->set_tensor_shape(parent_extended_shape); - _parent->set_valid_region(ValidRegion{ Coordinates(), parent_extended_shape }); + _parent->set_valid_region(ValidRegion{Coordinates(), parent_extended_shape}); } _tensor_shape = shape; return *this; @@ -133,11 +145,11 @@ bool SubTensorInfo::extend_padding(const PaddingSize &padding) ARM_COMPUTE_ERROR_ON(_parent->total_size() == 
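extend_parent_shape above grows each parent dimension just enough to cover the sub-tensor: placing a sub-tensor of shape (8) at coordinate (5) inside a parent of shape (10) computes dimension_extend = 5 + 8 = 13 > 10 and enlarges the parent to (13).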
0); // Check that you do not extend padding on sub-tensors unless XY shape matches parent tensor - if(!_extend_parent && (padding.left || padding.right)) + if (!_extend_parent && (padding.left || padding.right)) { ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().x() != tensor_shape().x()); } - if(!_extend_parent && (padding.top || padding.bottom)) + if (!_extend_parent && (padding.top || padding.bottom)) { ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().y() != tensor_shape().y()); } @@ -153,7 +165,7 @@ int32_t SubTensorInfo::offset_element_in_bytes(const Coordinates &pos) const int32_t offset = offset_first_element_in_bytes(); const Strides &strides = strides_in_bytes(); - for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i) + for (size_t i = 0; i < _tensor_shape.num_dimensions(); ++i) { offset += pos[i] * strides[i]; } diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp index 5905ba5215..31bddbde40 100644 --- a/src/core/TensorInfo.cpp +++ b/src/core/TensorInfo.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/Utils.h" #include <memory> @@ -34,13 +35,26 @@ namespace arm_compute { TensorInfo::TensorInfo() - : _total_size(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _dims_state(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN), _is_resizable{ true }, - _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW), _are_values_constant(true), _id(invalid_tensor_id), _lock_paddings(false) -{ -} - -TensorInfo::TensorInfo(const ITensorInfo &info) - : TensorInfo() + : _total_size(0), + _offset_first_element_in_bytes(0), + _strides_in_bytes(), + _num_channels(0), + _tensor_shape(), + _dims_state(), + _data_type(DataType::UNKNOWN), + _format(Format::UNKNOWN), + _is_resizable{true}, + _valid_region{Coordinates(), _tensor_shape}, + _padding{0}, + _quantization_info(), + _data_layout(DataLayout::NCHW), + _are_values_constant(true), + _id(invalid_tensor_id), + _lock_paddings(false) +{ +} + +TensorInfo::TensorInfo(const ITensorInfo &info) : TensorInfo() { _total_size = info.total_size(); _offset_first_element_in_bytes = info.offset_first_element_in_bytes(); @@ -60,8 +74,7 @@ TensorInfo::TensorInfo(const ITensorInfo &info) _lock_paddings = info.lock_paddings(); } -TensorInfo::TensorInfo(const TensorInfo &info) - : TensorInfo() +TensorInfo::TensorInfo(const TensorInfo &info) : TensorInfo() { _total_size = info.total_size(); _offset_first_element_in_bytes = info.offset_first_element_in_bytes(); @@ -80,8 +93,7 @@ TensorInfo::TensorInfo(const TensorInfo &info) _id = info.id(); _lock_paddings = false; } -TensorInfo::TensorInfo(Format format) - : TensorInfo(TensorShape(), format) +TensorInfo::TensorInfo(Format format) : TensorInfo(TensorShape(), format) { } @@ -90,25 +102,25 @@ TensorInfo::TensorInfo(unsigned int width, unsigned int height, Format format) { } -TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format) - : TensorInfo() +TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format) : TensorInfo() { init(tensor_shape, format); } -TensorInfo::TensorInfo(size_t num_channels, DataType data_type) - : TensorInfo() +TensorInfo::TensorInfo(size_t num_channels, DataType data_type) : TensorInfo() { init(TensorShape(), num_channels, data_type); } -TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type) - : 
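offset_element_in_bytes, both this SubTensorInfo version and the TensorInfo one later in the patch, is plain strided addressing: the offset of the first element plus pos[i] * strides[i] summed over dimensions. As a worked case, a dense 10x8 F32 tensor has strides {4, 40} bytes, so element (3, 2) sits at 0 + 3*4 + 2*40 = 92 bytes.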
TensorInfo() +TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type) : TensorInfo() { init(tensor_shape, num_channels, data_type); } -TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info) +TensorInfo::TensorInfo(const TensorShape &tensor_shape, + size_t num_channels, + DataType data_type, + QuantizationInfo quantization_info) : TensorInfo() { init(tensor_shape, num_channels, data_type); @@ -137,9 +149,11 @@ void TensorInfo::init(const TensorShape &tensor_shape, Format format) _format = format; } -void TensorInfo::init(const TensorShape &tensor_shape, Format format, - const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, - size_t total_size_in_bytes) +void TensorInfo::init(const TensorShape &tensor_shape, + Format format, + const Strides &strides_in_bytes, + size_t offset_first_element_in_bytes, + size_t total_size_in_bytes) { size_t num_channels = num_channels_from_format(format); const DataType type = data_type_from_format(format); @@ -165,9 +179,12 @@ void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, Data set_tensor_shape(tensor_shape); } -void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, - const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, - size_t total_size_in_bytes) +void TensorInfo::init(const TensorShape &tensor_shape, + size_t num_channels, + DataType data_type, + const Strides &strides_in_bytes, + size_t offset_first_element_in_bytes, + size_t total_size_in_bytes) { ARM_COMPUTE_ERROR_ON(num_channels == 0); @@ -179,7 +196,7 @@ void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, Data _strides_in_bytes = strides_in_bytes; _total_size = total_size_in_bytes; - _valid_region = ValidRegion{ Coordinates(), _tensor_shape }; + _valid_region = ValidRegion{Coordinates(), _tensor_shape}; } size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, Format format) @@ -202,7 +219,7 @@ size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, size_t num _format = Format::UNKNOWN; _tensor_shape = tensor_shape; - _valid_region = ValidRegion{ Coordinates(), _tensor_shape }; + _valid_region = ValidRegion{Coordinates(), _tensor_shape}; auto_padding(); @@ -233,11 +250,11 @@ std::tuple<Strides, size_t, size_t> TensorInfo::calculate_padding_requirements(c size_t required_total_size = 0; const size_t required_offset_first_element = padding.left * stride_x + padding.top * stride_y; - switch(_tensor_shape.num_dimensions()) + switch (_tensor_shape.num_dimensions()) { case 0: { - if(_tensor_shape.total_size() > 0) + if (_tensor_shape.total_size() > 0) { required_strides = Strides(stride_x, stride_x); required_total_size = stride_z; @@ -258,7 +275,8 @@ std::tuple<Strides, size_t, size_t> TensorInfo::calculate_padding_requirements(c const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1; - required_total_size = static_cast<size_t>(_tensor_shape[idx_last_dimension]) * required_strides[idx_last_dimension]; + required_total_size = + static_cast<size_t>(_tensor_shape[idx_last_dimension]) * required_strides[idx_last_dimension]; break; } } @@ -284,25 +302,25 @@ bool TensorInfo::extend_padding(const PaddingSize &padding) bool updated = false; - if(padding.top > _padding.top) + if (padding.top > _padding.top) { _padding.top = padding.top; updated = true; } - if(padding.right > _padding.right) + if (padding.right > 
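calculate_padding_requirements turns a requested border into concrete strides: with the usual dense padded layout the row pitch spans padding.left + width + padding.right elements, and, as the reformatted line shows, the first element moves to padding.left * stride_x + padding.top * stride_y. Worked case for a 16x8 F32 tensor with padding {top=1, right=2, bottom=1, left=2}: stride_x = 4, stride_y = (2 + 16 + 2) * 4 = 80, first-element offset = 2*4 + 1*80 = 88 bytes.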
_padding.right) { _padding.right = padding.right; updated = true; } - if(padding.bottom > _padding.bottom) + if (padding.bottom > _padding.bottom) { _padding.bottom = padding.bottom; updated = true; } - if(padding.left > _padding.left) + if (padding.left > _padding.left) { _padding.left = padding.left; updated = true; @@ -336,7 +354,7 @@ ITensorInfo &TensorInfo::set_format(Format format) { _format = format; - if(_data_type == DataType::UNKNOWN) + if (_data_type == DataType::UNKNOWN) { _num_channels = num_channels_from_format(format); _data_type = data_type_from_format(format); @@ -355,19 +373,19 @@ ITensorInfo &TensorInfo::set_tensor_shape(const TensorShape &shape) _offset_first_element_in_bytes = 0; _strides_in_bytes = compute_strides(*this); - if(_tensor_shape.num_dimensions() == 0) + if (_tensor_shape.num_dimensions() == 0) { _total_size = _strides_in_bytes[0]; } else { const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1; - _total_size = static_cast<size_t>(_tensor_shape[idx_last_dimension]) * _strides_in_bytes[idx_last_dimension]; + _total_size = static_cast<size_t>(_tensor_shape[idx_last_dimension]) * _strides_in_bytes[idx_last_dimension]; } std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding); - _valid_region = ValidRegion{ Coordinates(), _tensor_shape }; + _valid_region = ValidRegion{Coordinates(), _tensor_shape}; return *this; } @@ -392,9 +410,10 @@ ITensorInfo &TensorInfo::set_data_layout(const DataLayout &data_layout) ITensorInfo &TensorInfo::reset_padding() { _padding = PaddingSize(); - if(((_format != Format::UNKNOWN) || (_data_type != DataType::UNKNOWN)) && _total_size != 0) + if (((_format != Format::UNKNOWN) || (_data_type != DataType::UNKNOWN)) && _total_size != 0) { - std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding); + std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = + calculate_padding_requirements(_padding); } return *this; } @@ -405,7 +424,7 @@ int32_t TensorInfo::offset_element_in_bytes(const Coordinates &pos) const int32_t offset = _offset_first_element_in_bytes; - for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i) + for (size_t i = 0; i < _tensor_shape.num_dimensions(); ++i) { offset += pos[i] * _strides_in_bytes[i]; } diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index 1ca7adb3a8..90a7ac32c0 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -49,7 +49,7 @@ std::string read_file(const std::string &filename, bool binary) fs.exceptions(std::ifstream::failbit | std::ifstream::badbit); std::ios_base::openmode mode = std::ios::in; - if(binary) + if (binary) { mode |= std::ios::binary; } @@ -66,7 +66,7 @@ std::string read_file(const std::string &filename, bool binary) out.assign(std::istreambuf_iterator<char>(fs), std::istreambuf_iterator<char>()); #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::ifstream::failure &e) + catch (const std::ifstream::failure &e) { ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", filename.c_str(), e.what()); } @@ -77,32 +77,28 @@ std::string read_file(const std::string &filename, bool binary) const std::string &string_from_channel(Channel channel) { - static std::map<Channel, const std::string> channels_map = - { - { Channel::UNKNOWN, "UNKNOWN" }, - { Channel::R, "R" }, - { Channel::G, "G" }, - { Channel::B, "B" }, - { Channel::A, "A" }, - { Channel::Y, "Y" }, - { Channel::U, "U" }, - { Channel::V, "V" }, - { Channel::C0, "C0" }, - { 
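extend_padding, just above, never shrinks a border: it keeps the per-side maximum and reports whether anything grew. Extending an existing padding of {top=0, right=2, bottom=0, left=2} by {top=1, right=1, bottom=1, left=1} therefore yields {top=1, right=2, bottom=1, left=2} with updated = true.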
Channel::C1, "C1" }, - { Channel::C2, "C2" }, - { Channel::C3, "C3" } - }; + static std::map<Channel, const std::string> channels_map = {{Channel::UNKNOWN, "UNKNOWN"}, + {Channel::R, "R"}, + {Channel::G, "G"}, + {Channel::B, "B"}, + {Channel::A, "A"}, + {Channel::Y, "Y"}, + {Channel::U, "U"}, + {Channel::V, "V"}, + {Channel::C0, "C0"}, + {Channel::C1, "C1"}, + {Channel::C2, "C2"}, + {Channel::C3, "C3"}}; return channels_map[channel]; } const std::string &string_from_border_mode(BorderMode border_mode) { - static std::map<BorderMode, const std::string> border_mode_map = - { - { BorderMode::UNDEFINED, "UNDEFINED" }, - { BorderMode::CONSTANT, "CONSTANT" }, - { BorderMode::REPLICATE, "REPLICATE" }, + static std::map<BorderMode, const std::string> border_mode_map = { + {BorderMode::UNDEFINED, "UNDEFINED"}, + {BorderMode::CONSTANT, "CONSTANT"}, + {BorderMode::REPLICATE, "REPLICATE"}, }; return border_mode_map[border_mode]; @@ -110,11 +106,10 @@ const std::string &string_from_border_mode(BorderMode border_mode) const std::string &string_from_norm_type(NormType type) { - static std::map<NormType, const std::string> norm_type_map = - { - { NormType::IN_MAP_1D, "IN_MAP_1D" }, - { NormType::IN_MAP_2D, "IN_MAP_2D" }, - { NormType::CROSS_MAP, "CROSS_MAP" }, + static std::map<NormType, const std::string> norm_type_map = { + {NormType::IN_MAP_1D, "IN_MAP_1D"}, + {NormType::IN_MAP_2D, "IN_MAP_2D"}, + {NormType::CROSS_MAP, "CROSS_MAP"}, }; return norm_type_map[type]; @@ -122,11 +117,10 @@ const std::string &string_from_norm_type(NormType type) const std::string &string_from_pooling_type(PoolingType type) { - static std::map<PoolingType, const std::string> pool_type_map = - { - { PoolingType::MAX, "MAX" }, - { PoolingType::AVG, "AVG" }, - { PoolingType::L2, "L2" }, + static std::map<PoolingType, const std::string> pool_type_map = { + {PoolingType::MAX, "MAX"}, + {PoolingType::AVG, "AVG"}, + {PoolingType::L2, "L2"}, }; return pool_type_map[type]; @@ -134,38 +128,36 @@ const std::string &string_from_pooling_type(PoolingType type) bool is_pool_region_entirely_outside_input(const PoolingLayerInfo &info) { - if(info.is_global_pooling || info.exclude_padding || info.pool_size.x() == 0 || info.pool_size.y() == 0) + if (info.is_global_pooling || info.exclude_padding || info.pool_size.x() == 0 || info.pool_size.y() == 0) { return false; } const auto ps = info.pad_stride_info; - const auto pool_le_padding_x = info.pool_size.x() <= std::max({ ps.pad_left(), ps.pad_right() }); - const auto pool_le_padding_y = info.pool_size.y() <= std::max({ ps.pad_top(), ps.pad_bottom() }); + const auto pool_le_padding_x = info.pool_size.x() <= std::max({ps.pad_left(), ps.pad_right()}); + const auto pool_le_padding_y = info.pool_size.y() <= std::max({ps.pad_top(), ps.pad_bottom()}); return pool_le_padding_x || pool_le_padding_y; } bool is_pool_3d_region_entirely_outside_input(const Pooling3dLayerInfo &info) { - if(info.is_global_pooling || info.pool_size.x() == 0 || info.pool_size.y() == 0 || info.pool_size.z() == 0) + if (info.is_global_pooling || info.pool_size.x() == 0 || info.pool_size.y() == 0 || info.pool_size.z() == 0) { return false; } const auto ps = info.padding; - const auto pool_le_padding_x = info.pool_size.x() <= std::max({ ps.left, ps.right }); - const auto pool_le_padding_y = info.pool_size.y() <= std::max({ ps.top, ps.bottom }); - const auto pool_le_padding_z = info.pool_size.z() <= std::max({ ps.front, ps.back }); + const auto pool_le_padding_x = info.pool_size.x() <= std::max({ps.left, ps.right}); + const auto 
pool_le_padding_y = info.pool_size.y() <= std::max({ps.top, ps.bottom}); + const auto pool_le_padding_z = info.pool_size.z() <= std::max({ps.front, ps.back}); return pool_le_padding_x || pool_le_padding_y || pool_le_padding_z; } const std::string &string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage) { - static std::map<GEMMLowpOutputStageType, const std::string> output_stage_map = - { - { GEMMLowpOutputStageType::NONE, "" }, - { GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down" }, - { GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint" }, - { GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float" } - }; + static std::map<GEMMLowpOutputStageType, const std::string> output_stage_map = { + {GEMMLowpOutputStageType::NONE, ""}, + {GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down"}, + {GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint"}, + {GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float"}}; return output_stage_map[output_stage]; } @@ -175,7 +167,7 @@ std::string string_from_pixel_value(const PixelValue &value, const DataType data std::stringstream ss; std::string converted_string; - switch(data_type) + switch (data_type) { case DataType::U8: case DataType::QASYMM8: @@ -223,11 +215,16 @@ std::string string_from_pixel_value(const PixelValue &value, const DataType data return converted_string; } -PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout, const Size2D &dilation, +PadStrideInfo calculate_same_pad(TensorShape input_shape, + TensorShape weights_shape, + PadStrideInfo conv_info, + DataLayout data_layout, + const Size2D &dilation, const DimensionRoundingType &rounding_type) { const auto &strides = conv_info.stride(); - ARM_COMPUTE_ERROR_ON_MSG((strides.first < 1 || strides.second < 1), "Stride values should be greater than or equal to 1."); + ARM_COMPUTE_ERROR_ON_MSG((strides.first < 1 || strides.second < 1), + "Stride values should be greater than or equal to 1."); const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); @@ -246,8 +243,9 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh const int real_weight_height = (kernel_height - 1) * dilation.y() + 1; // Calculate total pad - const int pad_width = std::max(0, static_cast<int>((out_width - 1) * strides.first + real_weight_width - in_width)); - const int pad_height = std::max(0, static_cast<int>((out_height - 1) * strides.second + real_weight_height - in_height)); + const int pad_width = std::max(0, static_cast<int>((out_width - 1) * strides.first + real_weight_width - in_width)); + const int pad_height = + std::max(0, static_cast<int>((out_height - 1) * strides.second + real_weight_height - in_height)); // Calculate individual paddings const unsigned int pad_left = pad_width / 2; @@ -265,8 +263,10 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh return same_info; } -std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height, - unsigned int kernel_width, unsigned int kernel_height, +std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width, + unsigned int in_height, + unsigned int kernel_width, + unsigned int kernel_height, const 
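calculate_same_pad above follows the usual SAME rule: the output size is the input divided by the stride, rounded up; the total pad is max(0, (out - 1) * stride + dilated_kernel - in); and the split puts the odd pixel on the right/bottom since pad_left = pad_width / 2. Worked case: in = 10, stride = 2, kernel = 3, dilation = 1 gives out = 5, pad_width = max(0, 4*2 + 3 - 10) = 1, hence pad_left = 0 and pad_right = 1.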
PadStrideInfo &pad_stride_info) { const unsigned int pad_left = pad_stride_info.pad_left(); @@ -285,8 +285,10 @@ std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned i return std::make_pair<unsigned int, unsigned int>(w, h); } -std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height, - int kernel_width, int kernel_height, +std::pair<unsigned int, unsigned int> scaled_dimensions(int width, + int height, + int kernel_width, + int kernel_height, const PadStrideInfo &pad_stride_info, const Size2D &dilation) { @@ -300,15 +302,25 @@ std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height, const int stride_y = pad_stride_info.stride().second; int w = 0; int h = 0; - switch(pad_stride_info.round()) + switch (pad_stride_info.round()) { case DimensionRoundingType::FLOOR: - w = static_cast<int>(std::floor((static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + 1)); - h = static_cast<int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / stride_y) + 1)); + w = static_cast<int>(std::floor( + (static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + + 1)); + h = static_cast<int>( + std::floor((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / + stride_y) + + 1)); break; case DimensionRoundingType::CEIL: - w = static_cast<int>(std::ceil((static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + 1)); - h = static_cast<int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / stride_y) + 1)); + w = static_cast<int>(std::ceil( + (static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + + 1)); + h = static_cast<int>( + std::ceil((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / + stride_y) + + 1)); break; default: ARM_COMPUTE_ERROR("Unsupported rounding type"); @@ -319,9 +331,8 @@ std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height, return std::make_pair(static_cast<unsigned int>(w), static_cast<unsigned int>(h)); } -std::pair<int, int> scaled_dimensions_signed(int width, int height, - int kernel_width, int kernel_height, - const PadStrideInfo &pad_stride_info) +std::pair<int, int> scaled_dimensions_signed( + int width, int height, int kernel_width, int kernel_height, const PadStrideInfo &pad_stride_info) { const int pad_left = pad_stride_info.pad_left(); const int pad_top = pad_stride_info.pad_top(); @@ -331,15 +342,19 @@ std::pair<int, int> scaled_dimensions_signed(int width, int height, const int stride_y = pad_stride_info.stride().second; int w = 0; int h = 0; - switch(pad_stride_info.round()) + switch (pad_stride_info.round()) { case DimensionRoundingType::FLOOR: - w = static_cast<int>(std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); - h = static_cast<int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); + w = static_cast<int>( + std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); + h = static_cast<int>( + std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); break; case DimensionRoundingType::CEIL: - w = static_cast<int>(std::ceil((static_cast<float>(width + 
pad_left + pad_right - kernel_width) / stride_x) + 1)); - h = static_cast<int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); + w = static_cast<int>( + std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); + h = static_cast<int>( + std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); break; default: ARM_COMPUTE_ERROR("Unsupported rounding type"); @@ -348,8 +363,12 @@ std::pair<int, int> scaled_dimensions_signed(int width, int height, return std::make_pair(static_cast<int>(w), static_cast<int>(h)); } -std::tuple<int, int, int> scaled_3d_dimensions_signed(int width, int height, int depth, - int kernel_width, int kernel_height, int kernel_depth, +std::tuple<int, int, int> scaled_3d_dimensions_signed(int width, + int height, + int depth, + int kernel_width, + int kernel_height, + int kernel_depth, const Pooling3dLayerInfo &pool3d_info) { const int pad_left = pool3d_info.padding.left; @@ -365,17 +384,23 @@ std::tuple<int, int, int> scaled_3d_dimensions_signed(int width, int height, int int h = 0; int d = 0; - switch(pool3d_info.round_type) + switch (pool3d_info.round_type) { case DimensionRoundingType::FLOOR: - w = static_cast<int>(std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); - h = static_cast<int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); - d = static_cast<int>(std::floor((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1)); + w = static_cast<int>( + std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); + h = static_cast<int>( + std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); + d = static_cast<int>( + std::floor((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1)); break; case DimensionRoundingType::CEIL: - w = static_cast<int>(std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); - h = static_cast<int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); - d = static_cast<int>(std::ceil((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1)); + w = static_cast<int>( + std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1)); + h = static_cast<int>( + std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1)); + d = static_cast<int>( + std::ceil((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1)); break; default: ARM_COMPUTE_ERROR("Unsupported rounding type"); @@ -400,9 +425,9 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool // * Softmax with QASYMM8_SIGNED: scale = 1/256, offset = -128 // * LogSoftmax with QASYMM8: scale = 1/256, offset = 0 // * LogSoftmax with QASYMM8_SIGNED: scale = 16/256, offset = 127 - if(is_data_type_quantized_asymmetric_signed(input_type)) + if (is_data_type_quantized_asymmetric_signed(input_type)) { - if(is_log) + if (is_log) { return QuantizationInfo(16.f / 256, 127); } @@ -414,17 +439,21 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool return QuantizationInfo(1.f / 256, 0); } -std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info, DataType data_type, 
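Both rounding branches of scaled_dimensions implement out = round((in + pad_before + pad_after - dilated_kernel) / stride) + 1, with dilated_kernel = dilation * (k - 1) + 1. Worked FLOOR case for a ResNet-style stem: width 224, kernel 7, stride 2, pad 3+3, dilation 1 gives floor((224 + 6 - 7) / 2) + 1 = 111 + 1 = 112.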
UniformQuantizationInfo oq_info) +std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info, + DataType data_type, + UniformQuantizationInfo oq_info) { const bool is_qasymm8_signed = is_data_type_quantized_asymmetric_signed(data_type); const auto a = act_info.a(); const auto b = act_info.b(); - const int a_int = is_qasymm8_signed ? quantize_qasymm8_signed(a, oq_info) : quantize_qasymm8(a, oq_info); - const int b_int = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info); - const auto type_max_value = std::get<1>(get_min_max(data_type)).get<int32_t>(); + const int a_int = is_qasymm8_signed ? quantize_qasymm8_signed(a, oq_info) : quantize_qasymm8(a, oq_info); + const int b_int = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info); + const auto type_max_value = std::get<1>(get_min_max(data_type)).get<int32_t>(); - const int32_t min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int; - const int32_t max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int; + const int32_t min_activation = + act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int; + const int32_t max_activation = + act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int; return std::make_pair(min_activation, max_activation); } @@ -433,11 +462,11 @@ std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initi { std::unordered_map<const ITensorInfo *, PaddingSize> res; - for(const ITensor *tensor : tensors) + for (const ITensor *tensor : tensors) { - if(tensor) + if (tensor) { - res.insert({ tensor->info(), tensor->info()->padding() }); + res.insert({tensor->info(), tensor->info()->padding()}); } } @@ -448,11 +477,11 @@ std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initi { std::unordered_map<const ITensorInfo *, PaddingSize> res; - for(const ITensorInfo *info : infos) + for (const ITensorInfo *info : infos) { - if(info) + if (info) { - res.insert({ info, info->padding() }); + res.insert({info, info->padding()}); } } @@ -461,17 +490,20 @@ std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initi bool has_padding_changed(const std::unordered_map<const ITensorInfo *, PaddingSize> &padding_map) { - return std::find_if(padding_map.begin(), padding_map.end(), [](const std::pair<const ITensorInfo *, PaddingSize> &padding_info) - { - return (padding_info.first->padding() != padding_info.second); - }) - != padding_map.end(); + return std::find_if(padding_map.begin(), padding_map.end(), + [](const std::pair<const ITensorInfo *, PaddingSize> &padding_info) + { return (padding_info.first->padding() != padding_info.second); }) != padding_map.end(); } #ifdef ARM_COMPUTE_ASSERTS_ENABLED -void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim) +void print_consecutive_elements(std::ostream &s, + DataType dt, + const uint8_t *ptr, + unsigned int n, + int stream_width, + const std::string &element_delim) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: @@ -481,36 +513,46 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr case DataType::QSYMM8: case DataType::QASYMM8_SIGNED: case DataType::QSYMM8_PER_CHANNEL: - 
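get_quantized_activation_min_max maps the activation's real clip bounds into the output's integer domain with quantize(v) = round(v / scale) + offset. For a BOUNDED_RELU with a = 6.0 on QASYMM8 at scale = 6/255 and offset = 0, that gives max_activation = round(6.0 / (6/255)) = 255, while min_activation stays at the offset, 0; only RELU keeps the full type maximum, and only LU_BOUNDED_RELU replaces the lower bound with the quantized b.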
print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width, + element_delim); break; case DataType::U16: case DataType::QASYMM16: - print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width, + element_delim); break; case DataType::S16: case DataType::QSYMM16: - print_consecutive_elements_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n, stream_width, + element_delim); break; case DataType::U32: - print_consecutive_elements_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n, stream_width, + element_delim); break; case DataType::S32: - print_consecutive_elements_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n, stream_width, + element_delim); break; case DataType::U64: - print_consecutive_elements_impl<uint64_t>(s, reinterpret_cast<const uint64_t *>(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl<uint64_t>(s, reinterpret_cast<const uint64_t *>(ptr), n, stream_width, + element_delim); break; case DataType::S64: - print_consecutive_elements_impl<int64_t>(s, reinterpret_cast<const int64_t *>(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl<int64_t>(s, reinterpret_cast<const int64_t *>(ptr), n, stream_width, + element_delim); break; case DataType::BFLOAT16: - print_consecutive_elements_impl<bfloat16>(s, reinterpret_cast<const bfloat16 *>(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl<bfloat16>(s, reinterpret_cast<const bfloat16 *>(ptr), n, stream_width, + element_delim); break; case DataType::F16: - print_consecutive_elements_impl<half>(s, reinterpret_cast<const half *>(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl<half>(s, reinterpret_cast<const half *>(ptr), n, stream_width, + element_delim); break; case DataType::F32: - print_consecutive_elements_impl<float>(s, reinterpret_cast<const float *>(ptr), n, stream_width, element_delim); + print_consecutive_elements_impl<float>(s, reinterpret_cast<const float *>(ptr), n, stream_width, + element_delim); break; default: ARM_COMPUTE_ERROR("Undefined element size for given data type"); @@ -519,7 +561,7 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n) { - switch(dt) + switch (dt) { case DataType::U8: case DataType::QASYMM8: diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp index 5a6486e11e..d8f796193e 100644 --- a/src/core/Validate.cpp +++ b/src/core/Validate.cpp @@ -23,13 +23,16 @@ */ #include "arm_compute/core/Validate.h" -arm_compute::Status arm_compute::error_on_mismatching_windows(const char *function, const char *file, const int line, - const arm_compute::Window &full, const arm_compute::Window &win) +arm_compute::Status arm_compute::error_on_mismatching_windows(const char *function, + const char 
*file, + const int line, + const arm_compute::Window &full, + const arm_compute::Window &win) { full.validate(); win.validate(); - for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i) + for (size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() != win[i].start(), function, file, line); ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() != win[i].end(), function, file, line); @@ -38,13 +41,16 @@ arm_compute::Status arm_compute::error_on_mismatching_windows(const char *functi return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function, const char *file, const int line, - const arm_compute::Window &full, const arm_compute::Window &sub) +arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function, + const char *file, + const int line, + const arm_compute::Window &full, + const arm_compute::Window &sub) { full.validate(); sub.validate(); - for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i) + for (size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() > sub[i].start(), function, file, line); ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() < sub[i].end(), function, file, line); @@ -54,8 +60,12 @@ arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line, - const arm_compute::Window &full, const arm_compute::Window &window, const int dim) +arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(const char *function, + const char *file, + const int line, + const arm_compute::Window &full, + const arm_compute::Window &window, + const int dim) { full.validate(); window.validate(); @@ -67,65 +77,73 @@ arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(co return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line, - const arm_compute::Coordinates &pos, unsigned int max_dim) +arm_compute::Status arm_compute::error_on_coordinates_dimensions_gte( + const char *function, const char *file, const int line, const arm_compute::Coordinates &pos, unsigned int max_dim) { - for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i) + for (unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(pos[i] != 0, function, file, line); } return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_window_dimensions_gte(const char *function, const char *file, const int line, - const arm_compute::Window &win, unsigned int max_dim) +arm_compute::Status arm_compute::error_on_window_dimensions_gte( + const char *function, const char *file, const int line, const arm_compute::Window &win, unsigned int max_dim) { - for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i) + for (unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i) { - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR((win[i].start() != 0) || (win[i].end() != win[i].step()), - function, file, line, - "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR( + (win[i].start() != 0) || 
(win[i].end() != win[i].step()), function, file, line, + "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i); } return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line, +arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, + const char *file, + const int line, const arm_compute::ITensor *tensor) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor->info() == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->info()->num_dimensions() != 2, - function, file, line, - "Only 2D Tensors are supported by this kernel (%zu passed)", tensor->info()->num_dimensions()); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->info()->num_dimensions() != 2, function, file, line, + "Only 2D Tensors are supported by this kernel (%zu passed)", + tensor->info()->num_dimensions()); return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line, +arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, + const char *file, + const int line, const arm_compute::ITensorInfo *tensor) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->num_dimensions() != 2, - function, file, line, - "Only 2D Tensors are supported by this kernel (%zu passed)", tensor->num_dimensions()); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->num_dimensions() != 2, function, file, line, + "Only 2D Tensors are supported by this kernel (%zu passed)", + tensor->num_dimensions()); return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char *function, const char *file, const int line, - arm_compute::Format fmt, arm_compute::Channel cn) +arm_compute::Status arm_compute::error_on_channel_not_in_known_format( + const char *function, const char *file, const int line, arm_compute::Format fmt, arm_compute::Channel cn) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(fmt == arm_compute::Format::UNKNOWN, function, file, line); ARM_COMPUTE_RETURN_ERROR_ON_LOC(cn == arm_compute::Channel::UNKNOWN, function, file, line); - switch(fmt) + switch (fmt) { case arm_compute::Format::RGB888: - arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B); + arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, + arm_compute::Channel::G, arm_compute::Channel::B); break; case arm_compute::Format::RGBA8888: - arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B, arm_compute::Channel::A); + arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, + arm_compute::Channel::G, arm_compute::Channel::B, + arm_compute::Channel::A); break; case arm_compute::Format::UV88: - arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U, arm_compute::Channel::V); + arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U, + arm_compute::Channel::V); break; case arm_compute::Format::IYUV: case arm_compute::Format::UYVY422: @@ -133,7 +151,8 @@ arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char case arm_compute::Format::NV12: case 
arm_compute::Format::NV21: case arm_compute::Format::YUV444: - arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y, arm_compute::Channel::U, arm_compute::Channel::V); + arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y, + arm_compute::Channel::U, arm_compute::Channel::V); break; default: ARM_COMPUTE_ERROR_LOC(function, file, line, "Not supported format."); @@ -141,21 +160,26 @@ arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char *function, const char *file, const int line, +arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char *function, + const char *file, + const int line, const arm_compute::IKernel *kernel) { ARM_COMPUTE_RETURN_ERROR_ON_LOC(kernel == nullptr, function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(!kernel->is_window_configured(), - function, file, line, + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(!kernel->is_window_configured(), function, file, line, "This kernel hasn't been configured."); return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function, const char *file, const int line, - const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape) +arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function, + const char *file, + const int line, + const TensorShape &parent_shape, + const Coordinates &coords, + const TensorShape &shape) { // Check dimensions - for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i) + for (unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i) { const bool invalid_idx = coords[i] >= static_cast<int>(parent_shape[i]); const bool out_of_bounds_size = coords[i] + static_cast<int>(shape[i]) > static_cast<int>(parent_shape[i]); @@ -164,15 +188,20 @@ arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function return arm_compute::Status{}; } -arm_compute::Status arm_compute::error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line, - const ValidRegion &parent_valid_region, const ValidRegion &valid_region) +arm_compute::Status arm_compute::error_on_invalid_subtensor_valid_region(const char *function, + const char *file, + const int line, + const ValidRegion &parent_valid_region, + const ValidRegion &valid_region) { // Check valid regions - for(unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d) + for (unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d) { ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] > valid_region.anchor[d]), function, file, line); - ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) < (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])), - function, file, line); + ARM_COMPUTE_RETURN_ERROR_ON_LOC( + (parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) < + (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])), + function, file, line); } return arm_compute::Status{}; diff --git a/src/core/common/Macros.h b/src/core/common/Macros.h index d791154e5c..bc0ea29911 100644 --- a/src/core/common/Macros.h +++ b/src/core/common/Macros.h @@ -25,9 +25,9 @@ #define ARM_COMPUTE_COMMON_MACROS_H #define ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(TypeName) \ - TypeName(const TypeName &) = delete; \ + 
TypeName(const TypeName &) = delete; \ TypeName &operator=(const TypeName &) = delete; \ TypeName(TypeName &&) = default; \ - TypeName &operator=(TypeName &&) = default + TypeName &operator=(TypeName &&) = default #endif /* ARM_COMPUTE_COMMON_MACROS_H */ diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h index d6dc3449fc..686304b8d7 100644 --- a/src/core/common/Registrars.h +++ b/src/core/common/Registrars.h @@ -46,7 +46,7 @@ #else /* !defined(ENABLE_FP16_KERNELS) */ #define REGISTER_FP16_NEON(func_name) nullptr -#define REGISTER_FP16_SVE(func_name) nullptr +#define REGISTER_FP16_SVE(func_name) nullptr #define REGISTER_FP16_SVE2(func_name) nullptr #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ @@ -72,7 +72,7 @@ #else /* defined(ENABLE_FP32_KERNELS) */ #define REGISTER_FP32_NEON(func_name) nullptr -#define REGISTER_FP32_SVE(func_name) nullptr +#define REGISTER_FP32_SVE(func_name) nullptr #define REGISTER_FP32_SVE2(func_name) nullptr #endif /* defined(ENABLE_FP32_KERNELS) */ @@ -94,7 +94,7 @@ #else /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */ #define REGISTER_QASYMM8_SIGNED_NEON(func_name) nullptr -#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr +#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr #define REGISTER_QASYMM8_SIGNED_SVE2(func_name) nullptr #endif /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */ @@ -115,7 +115,7 @@ #else /* defined(ENABLE_QASYMM8_KERNELS) */ #define REGISTER_QASYMM8_NEON(func_name) nullptr -#define REGISTER_QASYMM8_SVE(func_name) nullptr +#define REGISTER_QASYMM8_SVE(func_name) nullptr #define REGISTER_QASYMM8_SVE2(func_name) nullptr #endif /* defined(ENABLE_QASYMM8_KERNELS) */ @@ -137,7 +137,7 @@ #else /* defined(ENABLE_QSYMM16_KERNELS) */ #define REGISTER_QSYMM16_NEON(func_name) nullptr -#define REGISTER_QSYMM16_SVE(func_name) nullptr +#define REGISTER_QSYMM16_SVE(func_name) nullptr #define REGISTER_QSYMM16_SVE2(func_name) nullptr #endif /* defined(ENABLE_QSYMM16_KERNELS) */ @@ -169,7 +169,7 @@ #else /* defined(ENABLE_INTEGER_KERNELS) */ #define REGISTER_INTEGER_NEON(func_name) nullptr -#define REGISTER_INTEGER_SVE(func_name) nullptr +#define REGISTER_INTEGER_SVE(func_name) nullptr #define REGISTER_INTEGER_SVE2(func_name) nullptr #endif /* defined(ENABLE_INTEGER_KERNELS) */ diff --git a/src/core/helpers/AutoConfiguration.h b/src/core/helpers/AutoConfiguration.h index 8715dcd74b..9df2a76983 100644 --- a/src/core/helpers/AutoConfiguration.h +++ b/src/core/helpers/AutoConfiguration.h @@ -24,9 +24,9 @@ #ifndef SRC_CORE_HELPERS_AUTOCONFIGURATION_H #define SRC_CORE_HELPERS_AUTOCONFIGURATION_H -#include "arm_compute/core/utils/DataTypeUtils.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/DataTypeUtils.h" namespace arm_compute { @@ -42,10 +42,11 @@ namespace arm_compute */ inline bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, - int num_channels, DataType data_type, - QuantizationInfo quantization_info = QuantizationInfo()) + int num_channels, + DataType data_type, + QuantizationInfo quantization_info = QuantizationInfo()) { - if(info.tensor_shape().total_size() == 0) + if (info.tensor_shape().total_size() == 0) { info.set_data_type(data_type); info.set_num_channels(num_channels); @@ -70,7 +71,7 @@ inline bool auto_init_if_empty(ITensorInfo &info, */ inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source) { - if(info_sink.tensor_shape().total_size() == 0) + if 
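
// For reference, the macro reformatted above is used at the top of a class
// declaration; `CpuExampleKernel` is a hypothetical name.
#include "src/core/common/Macros.h"

class CpuExampleKernel
{
public:
    CpuExampleKernel() = default;
    // Deletes the copy constructor/assignment and defaults the move equivalents.
    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuExampleKernel);
};
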
(info_sink.tensor_shape().total_size() == 0) { info_sink.set_data_type(info_source.data_type()); info_sink.set_num_channels(info_source.num_channels()); @@ -93,7 +94,7 @@ inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_s */ inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape) { - if(info.tensor_shape().total_size() == 0) + if (info.tensor_shape().total_size() == 0) { info.set_tensor_shape(shape); return true; @@ -112,7 +113,7 @@ inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape) */ inline bool set_format_if_unknown(ITensorInfo &info, Format format) { - if(info.data_type() == DataType::UNKNOWN) + if (info.data_type() == DataType::UNKNOWN) { info.set_format(format); return true; @@ -131,7 +132,7 @@ inline bool set_format_if_unknown(ITensorInfo &info, Format format) */ inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type) { - if(info.data_type() == DataType::UNKNOWN) + if (info.data_type() == DataType::UNKNOWN) { info.set_data_type(data_type); return true; @@ -150,7 +151,7 @@ inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type) */ inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout) { - if(info.data_layout() == DataLayout::UNKNOWN) + if (info.data_layout() == DataLayout::UNKNOWN) { info.set_data_layout(data_layout); return true; @@ -169,7 +170,7 @@ inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout */ inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info) { - if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type()))) + if (info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type()))) { info.set_quantization_info(quantization_info); return true; diff --git a/src/core/helpers/MemoryHelpers.h b/src/core/helpers/MemoryHelpers.h index a41052687b..dd094b414c 100644 --- a/src/core/helpers/MemoryHelpers.h +++ b/src/core/helpers/MemoryHelpers.h @@ -24,9 +24,9 @@ #ifndef SRC_COMMON_MEMORY_HELPERS_H #define SRC_COMMON_MEMORY_HELPERS_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/MemoryGroup.h" #include <memory> @@ -43,18 +43,17 @@ inline int offset_int_vec(int offset) template <typename TensorType> struct WorkspaceDataElement { - int slot{ -1 }; - experimental::MemoryLifetime lifetime{ experimental::MemoryLifetime::Temporary }; - std::unique_ptr<TensorType> tensor{ nullptr }; + int slot{-1}; + experimental::MemoryLifetime lifetime{experimental::MemoryLifetime::Temporary}; + std::unique_ptr<TensorType> tensor{nullptr}; }; template <typename TensorType> using WorkspaceData = std::vector<WorkspaceDataElement<TensorType>>; template <typename TensorType> -WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirements &mem_reqs, - MemoryGroup &mgroup, - ITensorPack &run_pack) +WorkspaceData<TensorType> +manage_workspace(const experimental::MemoryRequirements &mem_reqs, MemoryGroup &mgroup, ITensorPack &run_pack) { ITensorPack dummy_pack = ITensorPack(); return manage_workspace<TensorType>(mem_reqs, mgroup, run_pack, dummy_pack); @@ -63,24 +62,26 @@ WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirement template <typename TensorType> WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirements 
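
// A minimal sketch of the auto-initialization helper above: the first call
// fills in the empty TensorInfo and returns true; a repeat call would return
// false since the destination is no longer empty. The function is hypothetical.
#include "arm_compute/core/TensorInfo.h"
#include "src/core/helpers/AutoConfiguration.h"

void example_auto_init() // hypothetical
{
    arm_compute::TensorInfo dst{}; // empty: total_size() == 0
    const bool initialized =
        arm_compute::auto_init_if_empty(dst, arm_compute::TensorShape(16U, 16U), 1, arm_compute::DataType::F32);
    (void)initialized; // true on the first call
}
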
&mem_reqs, MemoryGroup &mgroup, - ITensorPack &run_pack, ITensorPack &prep_pack) + ITensorPack &run_pack, + ITensorPack &prep_pack) { WorkspaceData<TensorType> workspace_memory; - for(const auto &req : mem_reqs) + for (const auto &req : mem_reqs) { - if(req.size == 0) + if (req.size == 0) { continue; } - const auto aux_info = TensorInfo{ TensorShape(req.size), 1, DataType::U8 }; - workspace_memory.emplace_back(WorkspaceDataElement<TensorType> { req.slot, req.lifetime, std::make_unique<TensorType>() }); + const auto aux_info = TensorInfo{TensorShape(req.size), 1, DataType::U8}; + workspace_memory.emplace_back( + WorkspaceDataElement<TensorType>{req.slot, req.lifetime, std::make_unique<TensorType>()}); auto aux_tensor = workspace_memory.back().tensor.get(); ARM_COMPUTE_ERROR_ON_NULLPTR(aux_tensor); aux_tensor->allocator()->init(aux_info, req.alignment); - if(req.lifetime == experimental::MemoryLifetime::Temporary) + if (req.lifetime == experimental::MemoryLifetime::Temporary) { mgroup.manage(aux_tensor); } @@ -91,7 +92,7 @@ WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirement run_pack.add_tensor(req.slot, aux_tensor); } - for(auto &mem : workspace_memory) + for (auto &mem : workspace_memory) { auto tensor = mem.tensor.get(); tensor->allocator()->allocate(); @@ -103,31 +104,29 @@ WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirement template <typename TensorType> void release_prepare_tensors(WorkspaceData<TensorType> &workspace, ITensorPack &prep_pack) { - workspace.erase(std::remove_if(workspace.begin(), - workspace.end(), - [&prep_pack](auto & wk) - { - const bool to_erase = wk.lifetime == experimental::MemoryLifetime::Prepare; - if(to_erase) - { - prep_pack.remove_tensor(wk.slot); - } - return to_erase; - }), - workspace.end()); + workspace.erase(std::remove_if(workspace.begin(), workspace.end(), + [&prep_pack](auto &wk) + { + const bool to_erase = wk.lifetime == experimental::MemoryLifetime::Prepare; + if (to_erase) + { + prep_pack.remove_tensor(wk.slot); + } + return to_erase; + }), + workspace.end()); } /** Utility function to release tensors with lifetime marked as Prepare */ template <typename TensorType> -void release_temporaries(const experimental::MemoryRequirements &mem_reqs, - WorkspaceData<TensorType> &workspace) +void release_temporaries(const experimental::MemoryRequirements &mem_reqs, WorkspaceData<TensorType> &workspace) { - for(auto &ws : workspace) + for (auto &ws : workspace) { const int slot = ws.slot; - for(auto &m : mem_reqs) + for (auto &m : mem_reqs) { - if(m.slot == slot && m.lifetime == experimental::MemoryLifetime::Prepare) + if (m.slot == slot && m.lifetime == experimental::MemoryLifetime::Prepare) { auto tensor = ws.tensor.get(); tensor->allocator()->free(); diff --git a/src/core/helpers/PoolingHelpers.h b/src/core/helpers/PoolingHelpers.h index 079629ee6a..9ef045f472 100644 --- a/src/core/helpers/PoolingHelpers.h +++ b/src/core/helpers/PoolingHelpers.h @@ -33,8 +33,20 @@ namespace cpu namespace { -inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int pool_size_z, const int upper_bound_w, - const int upper_bound_h, const int upper_bound_d, const int pad_x, const int pad_y, const int pad_z, const int stride_x, const int stride_y, const int stride_z) +inline float calculate_avg_scale_pool3d(bool exclude_padding, + const Coordinates &id, + const int pool_size_x, + const int pool_size_y, + const int pool_size_z, + const int 
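
// A hedged sketch of the workspace helper above, as an operator wrapper might
// call it; `mem_reqs` would normally come from the operator's workspace()
// query, and the pack would already hold the real I/O tensors. Names are
// placeholders.
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/core/helpers/MemoryHelpers.h"

void example_workspace() // hypothetical
{
    arm_compute::MemoryGroup                    mgroup;
    arm_compute::ITensorPack                    run_pack;
    arm_compute::experimental::MemoryRequirements mem_reqs{}; // placeholder: usually op->workspace()
    // Allocates one auxiliary Tensor per non-empty requirement and registers
    // each in run_pack under its slot id.
    auto workspace = arm_compute::manage_workspace<arm_compute::Tensor>(mem_reqs, mgroup, run_pack);
    (void)workspace;
}
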
upper_bound_w, + const int upper_bound_h, + const int upper_bound_d, + const int pad_x, + const int pad_y, + const int pad_z, + const int stride_x, + const int stride_y, + const int stride_z) { // Based on NDHWC int start_x = id[1] * stride_x - pad_x; @@ -44,7 +56,7 @@ inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates const int end_x = std::min(start_x + pool_size_x, upper_bound_w); const int end_y = std::min(start_y + pool_size_y, upper_bound_h); const int end_z = std::min(start_z + pool_size_z, upper_bound_d); - if(exclude_padding) + if (exclude_padding) { start_x = std::max(0, start_x); start_y = std::max(0, start_y); @@ -53,8 +65,17 @@ inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates return 1.f / ((end_y - start_y) * (end_x - start_x) * (end_z - start_z)); } -inline float calculate_avg_scale_pool2d(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) +inline float calculate_avg_scale_pool2d(bool exclude_padding, + DataLayout data_layout, + const Coordinates &id, + const int pool_size_x, + const int pool_size_y, + const int upper_bound_w, + const int upper_bound_h, + const int pad_x, + const int pad_y, + const int stride_x, + const int stride_y) { const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); @@ -64,7 +85,7 @@ inline float calculate_avg_scale_pool2d(bool exclude_padding, DataLayout data_la const int end_x = std::min(start_x + pool_size_x, upper_bound_w); const int end_y = std::min(start_y + pool_size_y, upper_bound_h); - if(exclude_padding) + if (exclude_padding) { start_x = std::max(0, start_x); start_y = std::max(0, start_y); @@ -117,17 +138,26 @@ inline float32x4_t vcvtq_f32_q32(int32x4_t values) } template <typename Tout> -inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset); +inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, + const float quant_rescale, + const float scale_pooling, + const int32_t new_offset); template <> -inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset) +inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, + const float quant_rescale, + const float scale_pooling, + const int32_t new_offset) { const float new_scale = quant_rescale / scale_pooling; return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset)); } template <> -inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset) +inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, + const float quant_rescale, + const float scale_pooling, + const int32_t new_offset) { const float new_scale = quant_rescale / scale_pooling; return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset)); @@ -139,30 +169,24 @@ inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInf template <> inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo) { - const 
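
// A worked reading of calculate_avg_scale_pool2d above, assuming NHWC, a 3x3
// pool with stride 1, padding 1 and upper bounds of 8: at the top-left corner
// only a 2x2 region is in bounds when padding is excluded.
//
//   start_x = 0*1 - 1 = -1 -> clamped to 0;  end_x = min(-1 + 3, 8) = 2
//   start_y analogous, so scale = 1.f / ((2 - 0) * (2 - 0)) = 0.25f (not 1/9)
//
// In code (note the header keeps these helpers in an anonymous namespace, so
// they are only callable from a translation unit that includes it):
//   arm_compute::Coordinates id(0, 0, 0); // channel 0, x = 0, y = 0 in NHWC
//   const float scale = calculate_avg_scale_pool2d(true, arm_compute::DataLayout::NHWC,
//                                                  id, 3, 3, 8, 8, 1, 1, 1, 1);
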
float32x4x4_t acc = - { - { - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))), - } - }; + const float32x4x4_t acc = {{ + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))), + }}; return vquantize(acc, requant_qinfo); } template <> inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo) { - const float32x4x4_t acc = - { - { - vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))), - } - }; + const float32x4x4_t acc = {{ + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))), + }}; return vquantize_signed(acc, requant_qinfo); } @@ -172,26 +196,20 @@ inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinf template <> inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo) { - const float32x4x2_t acc = - { - { - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))), - } - }; + const float32x4x2_t acc = {{ + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))), + }}; return vquantize(acc, requant_qinfo); } template <> inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo) { - const float32x4x2_t acc = - { - { - vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))), - } - }; + const float32x4x2_t acc = {{ + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))), + }}; return vquantize_signed(acc, requant_qinfo); } @@ -199,4 +217,3 @@ inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo } // namespace cpu } // namespace arm_compute #endif /* SRC_CORE_HELPERS_POOLINGHELPERS_H */ - diff --git a/src/core/helpers/ScaleHelpers.h b/src/core/helpers/ScaleHelpers.h index e769bba782..47605e7385 100644 --- a/src/core/helpers/ScaleHelpers.h +++ b/src/core/helpers/ScaleHelpers.h @@ -50,8 +50,12 @@ namespace scale_helpers * * @return The bilinear interpolated pixel value */ -inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stride, float dx, float dy, - UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info) +inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, + size_t stride, + float dx, + float dy, + UniformQuantizationInfo iq_info, + UniformQuantizationInfo oq_info) { ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr); @@ -85,8 +89,12 @@ inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stri * * @return The bilinear interpolated pixel value */ -inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride, float dx, float dy, - 
UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info) +inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, + size_t stride, + float dx, + float dy, + UniformQuantizationInfo iq_info, + UniformQuantizationInfo oq_info) { ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr); @@ -122,9 +130,8 @@ inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride * * @return The pixel at (x, y) using area interpolation. */ -inline uint8_t -pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, - float hr, int x, int y) +inline uint8_t pixel_area_c1u8_clamp( + const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y) { ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr); @@ -159,7 +166,7 @@ pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t widt // Sum pixels in area int sum = 0; - for(int j = yi + y_from, je = yi + y_to; j <= je; ++j) + for (int j = yi + y_from, je = yi + y_to; j <= je; ++j) { const uint8_t *ptr = first_pixel_ptr + j * stride + xi + x_from; sum = std::accumulate(ptr, ptr + x_elements, sum); diff --git a/src/core/helpers/SoftmaxHelpers.cpp b/src/core/helpers/SoftmaxHelpers.cpp index 71b971af31..8184991ab5 100644 --- a/src/core/helpers/SoftmaxHelpers.cpp +++ b/src/core/helpers/SoftmaxHelpers.cpp @@ -29,7 +29,7 @@ namespace softmax_helpers { PermutationVector get_permutation_vector_from_softmax_axis(size_t axis) { - switch(axis) + switch (axis) { case 1: return PermutationVector(1U, 0U, 2U, 3U); diff --git a/src/core/helpers/Utils.cpp b/src/core/helpers/Utils.cpp index 3900475355..6ca29d180d 100644 --- a/src/core/helpers/Utils.cpp +++ b/src/core/helpers/Utils.cpp @@ -31,9 +31,9 @@ bool has_holes(const ITensorInfo &info, size_t dimension) const auto &strides = info.strides_in_bytes(); size_t squashed_bytes = info.element_size(); - for(size_t dim = 0; dim <= dimension; ++dim) + for (size_t dim = 0; dim <= dimension; ++dim) { - if(strides[dim] != squashed_bytes) + if (strides[dim] != squashed_bytes) { return true; } diff --git a/src/core/helpers/Utils.h b/src/core/helpers/Utils.h index 7ad960bfa2..2e7224c55b 100644 --- a/src/core/helpers/Utils.h +++ b/src/core/helpers/Utils.h @@ -45,7 +45,7 @@ inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&...fixe // Create strides object Strides strides(stride_x, fixed_strides...); - for(size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i) + for (size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i) { strides.set(i, shape[i - 1] * strides[i - 1]); } diff --git a/src/core/helpers/WindowHelpers.cpp b/src/core/helpers/WindowHelpers.cpp index a4d46db352..30a55fcbc6 100644 --- a/src/core/helpers/WindowHelpers.cpp +++ b/src/core/helpers/WindowHelpers.cpp @@ -25,9 +25,10 @@ namespace arm_compute { -Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size) +Window +calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size) { - if(!skip_border) + if (!skip_border) { border_size = BorderSize(0); } @@ -38,40 +39,47 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, Window window; window.set(0, Window::Dimension( - // Skip the border left of the image - anchor[0] + border_size.left, - // Skip the border right of the image - // Make sure the window width is a multiple of the step size - anchor[0] + 
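
// The softmax helper above maps an axis onto a permutation that brings that
// axis to the front; a minimal sketch (header path assumed from this file's
// location, wrapping function hypothetical):
#include "src/core/helpers/SoftmaxHelpers.h"

void example_softmax_perm() // hypothetical
{
    // For axis == 1 the reformatted switch returns PermutationVector(1U, 0U, 2U, 3U),
    // i.e. dimensions 0 and 1 are exchanged before the 1D softmax pass.
    const arm_compute::PermutationVector perm =
        arm_compute::softmax_helpers::get_permutation_vector_from_softmax_axis(1);
    (void)perm;
}
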
border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]), - steps[0])); + // Skip the border left of the image + anchor[0] + border_size.left, + // Skip the border right of the image + // Make sure the window width is a multiple of the step size + anchor[0] + border_size.left + + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - + static_cast<int>(border_size.right)), + steps[0]), + steps[0])); size_t n = 1; - if(anchor.num_dimensions() > 1) + if (anchor.num_dimensions() > 1) { - window.set(1, Window::Dimension( + window.set(1, + Window::Dimension( // Skip the border above the image anchor[1] + border_size.top, // Skip the border below the image - anchor[1] + border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]), + anchor[1] + border_size.top + + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - + static_cast<int>(border_size.bottom)), + steps[1]), steps[1])); ++n; } - if(anchor.num_dimensions() > 2) + if (anchor.num_dimensions() > 2) { window.set(2, Window::Dimension(anchor[2], std::max<size_t>(1, shape[2]), steps[2])); ++n; } - for(; n < anchor.num_dimensions(); ++n) + for (; n < anchor.num_dimensions(); ++n) { window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n]))); } - for(; n < Coordinates::num_max_dimensions; ++n) + for (; n < Coordinates::num_max_dimensions; ++n) { window.set(n, Window::Dimension(0, 1)); } @@ -81,7 +89,7 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, Window calculate_max_window(const TensorShape &shape, const Steps &steps, bool skip_border, BorderSize border_size) { - if(!skip_border) + if (!skip_border) { border_size = BorderSize(0); } @@ -89,40 +97,46 @@ Window calculate_max_window(const TensorShape &shape, const Steps &steps, bool s Window window; window.set(0, Window::Dimension( - // Skip the border left of the image - border_size.left, - // Skip the border right of the image - // Make sure the window width is a multiple of the step size - border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]), - steps[0])); + // Skip the border left of the image + border_size.left, + // Skip the border right of the image + // Make sure the window width is a multiple of the step size + border_size.left + + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - + static_cast<int>(border_size.right)), + steps[0]), + steps[0])); size_t n = 1; - if(shape.num_dimensions() > 1) + if (shape.num_dimensions() > 1) { window.set(1, Window::Dimension( - // Skip the border above the image - border_size.top, - // Skip the border below the image - border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]), - steps[1])); + // Skip the border above the image + border_size.top, + // Skip the border below the image + border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - + static_cast<int>(border_size.top) - + static_cast<int>(border_size.bottom)), + steps[1]), + steps[1])); ++n; } - if(shape.num_dimensions() > 2) + if (shape.num_dimensions() > 2) { window.set(2, 
Window::Dimension(0, std::max<size_t>(1, shape[2]), steps[2])); ++n; } - for(; n < shape.num_dimensions(); ++n) + for (; n < shape.num_dimensions(); ++n) { window.set(n, Window::Dimension(0, std::max<size_t>(1, shape[n]))); } - for(; n < Coordinates::num_max_dimensions; ++n) + for (; n < Coordinates::num_max_dimensions; ++n) { window.set(n, Window::Dimension(0, 1)); } @@ -138,40 +152,42 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step Window window; window.set(0, Window::Dimension( - // move the anchor to the start from the border - anchor[0] - border_size.left, - // move the anchor to include the right end border - // Make sure the window width is a multiple of the step size - anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]), - steps[0])); + // move the anchor to the start from the border + anchor[0] - border_size.left, + // move the anchor to include the right end border + // Make sure the window width is a multiple of the step size + anchor[0] - border_size.left + + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]), + steps[0])); size_t n = 1; - if(anchor.num_dimensions() > 1) + if (anchor.num_dimensions() > 1) { window.set(1, Window::Dimension( - // Include the border above the image - anchor[1] - border_size.top, - // Include the border below the image - anchor[1] - border_size.top + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]), - steps[1])); + // Include the border above the image + anchor[1] - border_size.top, + // Include the border below the image + anchor[1] - border_size.top + + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]), + steps[1])); ++n; } - if(anchor.num_dimensions() > 2) + if (anchor.num_dimensions() > 2) { window.set(2, Window::Dimension(0, std::max<size_t>(1, shape[n]), steps[2])); ++n; } - for(; n < anchor.num_dimensions(); ++n) + for (; n < anchor.num_dimensions(); ++n) { window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n]))); } - for(; n < Coordinates::num_max_dimensions; ++n) + for (; n < Coordinates::num_max_dimensions; ++n) { window.set(n, Window::Dimension(0, 1)); } @@ -179,9 +195,12 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step return window; } -Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size) +Window calculate_max_window_horizontal(const ValidRegion &valid_region, + const Steps &steps, + bool skip_border, + BorderSize border_size) { - if(skip_border) + if (skip_border) { border_size.top = 0; border_size.bottom = 0; @@ -198,33 +217,35 @@ Window calculate_max_window_horizontal(const ValidRegion &valid_region, const St Window window; window.set(0, Window::Dimension( - // Skip the border left of the image - anchor[0] + border_size.left, - // Skip the border right of the image - // Make sure the window width is a multiple of the step size - anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]), - steps[0])); + // Skip the border left of the image + anchor[0] + border_size.left, + // Skip the border right of the image + // Make sure the window width is a multiple of the step size + anchor[0] + border_size.left + + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - + 
static_cast<int>(border_size.right)), + steps[0]), + steps[0])); size_t n = 1; - if(anchor.num_dimensions() > 1) + if (anchor.num_dimensions() > 1) { window.set(1, Window::Dimension( - // Skip the border above the image - anchor[1] - border_size.top, - // Skip the border below the image - anchor[1] + shape[1] + border_size.bottom, - 1)); + // Skip the border above the image + anchor[1] - border_size.top, + // Skip the border below the image + anchor[1] + shape[1] + border_size.bottom, 1)); ++n; } - for(; n < anchor.num_dimensions(); ++n) + for (; n < anchor.num_dimensions(); ++n) { window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n]))); } - for(; n < Coordinates::num_max_dimensions; ++n) + for (; n < Coordinates::num_max_dimensions; ++n) { window.set(n, Window::Dimension(0, 1)); } @@ -247,9 +268,9 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr size_t squashed_bytes = src0.element_size(); // Try to squash the low dimensions together. - for(; dim < num_dimensions; ++dim) + for (; dim < num_dimensions; ++dim) { - if(shape0[dim] != shape1[dim] || strides0[dim] != squashed_bytes || strides1[dim] != squashed_bytes) + if (shape0[dim] != shape1[dim] || strides0[dim] != squashed_bytes || strides1[dim] != squashed_bytes) { break; } @@ -257,7 +278,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr squashed_bytes *= shape0[dim]; } - if(dim == num_dimensions) + if (dim == num_dimensions) { auto squashed_elements = squashed_bytes / src0.element_size(); @@ -266,7 +287,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr // The input tensors can be interpreted as 1D array. win.set(0, Window::Dimension(0, squashed_elements, 1)); - for(dim = 1; dim < Coordinates::num_max_dimensions; ++dim) + for (dim = 1; dim < Coordinates::num_max_dimensions; ++dim) { win.set(dim, Window::Dimension(0, 1, 1)); } @@ -274,7 +295,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr else { // Generates the max window. - for(dim = 0; dim < Coordinates::num_max_dimensions; ++dim) + for (dim = 0; dim < Coordinates::num_max_dimensions; ++dim) { win.set(dim, Window::Dimension(0, std::max(shape0[dim], shape1[dim]), 1)); } @@ -295,21 +316,21 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr size_t squashed_bytes = src.element_size(); // Try to squash the low dimensions together. - for(; dim < num_dimensions; ++dim) + for (; dim < num_dimensions; ++dim) { - if(strides[dim] != squashed_bytes) + if (strides[dim] != squashed_bytes) { break; } squashed_bytes *= shape[dim]; } - if(dim == num_dimensions) + if (dim == num_dimensions) { const auto squashed_elements = squashed_bytes / src.element_size(); split_dimension = Window::DimX; // The input tensor can be interpreted as 1D array. win.set(0, Window::Dimension(0, squashed_elements, 1)); - for(dim = 1; dim < Coordinates::num_max_dimensions; ++dim) + for (dim = 1; dim < Coordinates::num_max_dimensions; ++dim) { win.set(dim, Window::Dimension(0, 1, 1)); } @@ -317,7 +338,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr else { // Generate the max window. 
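
// A sketch of the squashing behaviour above: a dense tensor with no padding
// collapses to a single 1D window, which callers can then split on DimX. The
// wrapping function is hypothetical.
#include <tuple>

#include "arm_compute/core/TensorInfo.h"
#include "src/core/helpers/WindowHelpers.h"

void example_squashed_window() // hypothetical
{
    const arm_compute::TensorInfo src(arm_compute::TensorShape(8U, 4U), 1, arm_compute::DataType::F32);
    arm_compute::Window win;
    size_t              split_dim = 0;
    // Contiguous 8x4 F32: every stride matches the squashed byte count, so the
    // returned window is [0, 32) with step 1 over a single dimension.
    std::tie(win, split_dim) = arm_compute::calculate_squashed_or_max_window(src);
    (void)win;
    (void)split_dim;
}
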
- for(dim = 0; dim < Coordinates::num_max_dimensions; ++dim) + for (dim = 0; dim < Coordinates::num_max_dimensions; ++dim) { win.set(dim, Window::Dimension(0, shape[dim], 1)); } diff --git a/src/core/helpers/WindowHelpers.h b/src/core/helpers/WindowHelpers.h index eccf7f2d18..e404c18e8a 100644 --- a/src/core/helpers/WindowHelpers.h +++ b/src/core/helpers/WindowHelpers.h @@ -43,21 +43,13 @@ namespace arm_compute * influence the returned value. */ template <typename... Ts> -bool update_window_and_padding(Window &win, Ts &&... patterns) +bool update_window_and_padding(Window &win, Ts &&...patterns) { bool window_changed = false; - utility::for_each([&](const IAccessWindow & w) - { - window_changed |= w.update_window_if_needed(win); - }, - patterns...); + utility::for_each([&](const IAccessWindow &w) { window_changed |= w.update_window_if_needed(win); }, patterns...); - utility::for_each([&](IAccessWindow & w) - { - w.update_padding_if_needed(win); - }, - patterns...); + utility::for_each([&](IAccessWindow &w) { w.update_padding_if_needed(win); }, patterns...); return window_changed; } @@ -69,18 +61,18 @@ bool update_window_and_padding(Window &win, Ts &&... patterns) * @return Intersection of all regions. */ template <typename... Ts> -ValidRegion intersect_valid_regions(const Ts &... regions) +ValidRegion intersect_valid_regions(const Ts &...regions) { - auto intersect = [](const ValidRegion & r1, const ValidRegion & r2) -> ValidRegion + auto intersect = [](const ValidRegion &r1, const ValidRegion &r2) -> ValidRegion { ValidRegion region; - for(size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d) + for (size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d) { region.anchor.set(d, std::max(r1.anchor[d], r2.anchor[d])); } - for(size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d) + for (size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d) { region.shape.set(d, std::min(r1.shape[d], r2.shape[d])); } @@ -101,7 +93,10 @@ ValidRegion intersect_valid_regions(const Ts &... regions) * * @return The maximum window the kernel can be executed on. */ -Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()); +Window calculate_max_window(const ValidRegion &valid_region, + const Steps &steps = Steps(), + bool skip_border = false, + BorderSize border_size = BorderSize()); /** Calculate the maximum window for a given tensor shape and border setting * @@ -112,7 +107,10 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps * * @return The maximum window the kernel can be executed on. */ -Window calculate_max_window(const TensorShape &shape, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()); +Window calculate_max_window(const TensorShape &shape, + const Steps &steps = Steps(), + bool skip_border = false, + BorderSize border_size = BorderSize()); /** Calculate the maximum window for a given tensor shape and border setting * @@ -123,7 +121,10 @@ Window calculate_max_window(const TensorShape &shape, const Steps &steps = Steps * * @return The maximum window the kernel can be executed on. 
*/ -inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()) +inline Window calculate_max_window(const ITensorInfo &info, + const Steps &steps = Steps(), + bool skip_border = false, + BorderSize border_size = BorderSize()) { return calculate_max_window(info.tensor_shape(), steps, skip_border, border_size); } @@ -137,7 +138,10 @@ inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps = * * @return The maximum window the kernel can be executed on. */ -Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()); +Window calculate_max_window_horizontal(const ValidRegion &valid_region, + const Steps &steps = Steps(), + bool skip_border = false, + BorderSize border_size = BorderSize()); /** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting * @@ -148,7 +152,10 @@ Window calculate_max_window_horizontal(const ValidRegion &valid_region, const St * * @return The maximum window the kernel can be executed on. */ -inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()) +inline Window calculate_max_window_horizontal(const ITensorInfo &info, + const Steps &steps = Steps(), + bool skip_border = false, + BorderSize border_size = BorderSize()) { return calculate_max_window_horizontal(info.valid_region(), steps, skip_border, border_size); } @@ -161,7 +168,9 @@ inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Ste * * @return The maximum window the kernel can be executed on. */ -Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps = Steps(), BorderSize border_size = BorderSize()); +Window calculate_max_enlarged_window(const ValidRegion &valid_region, + const Steps &steps = Steps(), + BorderSize border_size = BorderSize()); /** Calculate the maximum window for a given tensor shape and border setting. The window will also includes the border. * @@ -171,7 +180,9 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step * * @return The maximum window the kernel can be executed on. */ -inline Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps = Steps(), BorderSize border_size = BorderSize()) +inline Window calculate_max_enlarged_window(const ITensorInfo &info, + const Steps &steps = Steps(), + BorderSize border_size = BorderSize()) { return calculate_max_enlarged_window(info.valid_region(), steps, border_size); } @@ -208,7 +219,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr * @return A pair of the shape and window */ template <typename... Shapes> -std::pair<TensorShape, Window> compute_output_shape_and_window(const Shapes &... 
shapes) +std::pair<TensorShape, Window> compute_output_shape_and_window(const Shapes &...shapes) { const TensorShape out_shape = TensorShape::broadcast_shape(shapes...); return std::make_pair(out_shape, calculate_max_window(out_shape)); diff --git a/src/core/utils/ActivationFunctionUtils.cpp b/src/core/utils/ActivationFunctionUtils.cpp index 4854b8eb0b..017170a0c5 100644 --- a/src/core/utils/ActivationFunctionUtils.cpp +++ b/src/core/utils/ActivationFunctionUtils.cpp @@ -28,26 +28,24 @@ namespace arm_compute { -const std::string &string_from_activation_func(const ActivationFunction& act) +const std::string &string_from_activation_func(const ActivationFunction &act) { - static std::map<ActivationFunction, const std::string> act_map = - { - { ActivationFunction::ABS, "ABS" }, - { ActivationFunction::LINEAR, "LINEAR" }, - { ActivationFunction::LOGISTIC, "LOGISTIC" }, - { ActivationFunction::RELU, "RELU" }, - { ActivationFunction::BOUNDED_RELU, "BRELU" }, - { ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU" }, - { ActivationFunction::LEAKY_RELU, "LRELU" }, - { ActivationFunction::SOFT_RELU, "SRELU" }, - { ActivationFunction::ELU, "ELU" }, - { ActivationFunction::SQRT, "SQRT" }, - { ActivationFunction::SQUARE, "SQUARE" }, - { ActivationFunction::TANH, "TANH" }, - { ActivationFunction::IDENTITY, "IDENTITY" }, - { ActivationFunction::HARD_SWISH, "HARD_SWISH" }, - { ActivationFunction::SWISH, "SWISH" }, - { ActivationFunction::GELU, "GELU" } + static std::map<ActivationFunction, const std::string> act_map = {{ActivationFunction::ABS, "ABS"}, + {ActivationFunction::LINEAR, "LINEAR"}, + {ActivationFunction::LOGISTIC, "LOGISTIC"}, + {ActivationFunction::RELU, "RELU"}, + {ActivationFunction::BOUNDED_RELU, "BRELU"}, + {ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU"}, + {ActivationFunction::LEAKY_RELU, "LRELU"}, + {ActivationFunction::SOFT_RELU, "SRELU"}, + {ActivationFunction::ELU, "ELU"}, + {ActivationFunction::SQRT, "SQRT"}, + {ActivationFunction::SQUARE, "SQUARE"}, + {ActivationFunction::TANH, "TANH"}, + {ActivationFunction::IDENTITY, "IDENTITY"}, + {ActivationFunction::HARD_SWISH, "HARD_SWISH"}, + {ActivationFunction::SWISH, "SWISH"}, + {ActivationFunction::GELU, "GELU"} }; diff --git a/src/core/utils/AssemblyUtils.cpp b/src/core/utils/AssemblyUtils.cpp index 6d483adc7f..d97ea42091 100644 --- a/src/core/utils/AssemblyUtils.cpp +++ b/src/core/utils/AssemblyUtils.cpp @@ -34,12 +34,12 @@ arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act) arm_gemm::Activation gemm_act; // Early exit in case lower bound is other than 0, as it's not yet supported - if(act.b() != 0.f) + if (act.b() != 0.f) { return gemm_act; } - switch(act.activation()) + switch (act.activation()) { case ActivationLayerInfo::ActivationFunction::RELU: gemm_act.type = arm_gemm::Activation::Type::ReLU; @@ -63,17 +63,15 @@ arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act) arm_conv::PaddingValues map_to_arm_conv_padding(const PadStrideInfo &pad_stride_info) { - return arm_conv::PaddingValues{ pad_stride_info.pad_left(), - pad_stride_info.pad_top(), - pad_stride_info.pad_right(), - pad_stride_info.pad_bottom() }; + return arm_conv::PaddingValues{pad_stride_info.pad_left(), pad_stride_info.pad_top(), pad_stride_info.pad_right(), + pad_stride_info.pad_bottom()}; } arm_gemm::WeightFormat map_to_arm_gemm_weight_format(const arm_compute::WeightFormat &weight_format) { arm_gemm::WeightFormat gemm_weight_fromat; - switch(weight_format) + switch (weight_format) { case 
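
// A small sketch of the activation mapping above; note the early exit: any
// activation whose lower bound b() is non-zero maps to the default-constructed
// arm_gemm::Activation. The wrapping function is hypothetical.
#include "src/core/utils/AssemblyUtils.h"

void example_gemm_activation() // hypothetical
{
    const arm_compute::ActivationLayerInfo relu(arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
    // RELU with default bounds (b() == 0) takes the ReLU branch of the switch.
    const arm_gemm::Activation act = arm_compute::assembly_utils::map_to_arm_gemm_activation(relu);
    (void)act;
}
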
arm_compute::WeightFormat::UNSPECIFIED: gemm_weight_fromat = arm_gemm::WeightFormat::UNSPECIFIED; @@ -193,7 +191,7 @@ arm_compute::WeightFormat map_to_arm_compute_weight_format(const arm_gemm::Weigh { arm_compute::WeightFormat acl_weight_fromat; - switch(weight_format) + switch (weight_format) { case arm_gemm::WeightFormat::UNSPECIFIED: acl_weight_fromat = arm_compute::WeightFormat::UNSPECIFIED; diff --git a/src/core/utils/AssemblyUtils.h b/src/core/utils/AssemblyUtils.h index 60bad3b618..7d0d37c4ef 100644 --- a/src/core/utils/AssemblyUtils.h +++ b/src/core/utils/AssemblyUtils.h @@ -25,6 +25,7 @@ #define UTILS_CORE_ASSEMBLY_UTILS_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/kernels/assembly/common.hpp" #include "src/cpu/kernels/assembly/arm_gemm.hpp" @@ -65,6 +66,6 @@ arm_gemm::WeightFormat map_to_arm_gemm_weight_format(const arm_compute::WeightFo * @return Compute Library WeightFormat */ arm_compute::WeightFormat map_to_arm_compute_weight_format(const arm_gemm::WeightFormat &weight_format); -} // namespace assembly +} // namespace assembly_utils } // namespace arm_compute #endif /* UTILS_CORE_ASSEMBLY_UTILS_H */ diff --git a/src/core/utils/DataLayoutUtils.cpp b/src/core/utils/DataLayoutUtils.cpp index 4919b79a42..234bed71cb 100644 --- a/src/core/utils/DataLayoutUtils.cpp +++ b/src/core/utils/DataLayoutUtils.cpp @@ -29,11 +29,10 @@ namespace arm_compute const std::string &string_from_data_layout(DataLayout dl) { - static std::map<DataLayout, const std::string> dl_map = - { - { DataLayout::UNKNOWN, "UNKNOWN" }, - { DataLayout::NCHW, "NCHW" }, - { DataLayout::NHWC, "NHWC" }, + static std::map<DataLayout, const std::string> dl_map = { + {DataLayout::UNKNOWN, "UNKNOWN"}, + {DataLayout::NCHW, "NCHW"}, + {DataLayout::NHWC, "NHWC"}, }; return dl_map[dl]; diff --git a/src/core/utils/DataTypeUtils.cpp b/src/core/utils/DataTypeUtils.cpp index 07999354d9..1394339987 100644 --- a/src/core/utils/DataTypeUtils.cpp +++ b/src/core/utils/DataTypeUtils.cpp @@ -30,27 +30,26 @@ namespace arm_compute { const std::string &string_from_data_type(DataType dt) { - static std::map<DataType, const std::string> dt_map = - { - { DataType::UNKNOWN, "UNKNOWN" }, - { DataType::S8, "S8" }, - { DataType::U8, "U8" }, - { DataType::S16, "S16" }, - { DataType::U16, "U16" }, - { DataType::S32, "S32" }, - { DataType::U32, "U32" }, - { DataType::S64, "S64" }, - { DataType::U64, "U64" }, - { DataType::F16, "F16" }, - { DataType::F32, "F32" }, - { DataType::F64, "F64" }, - { DataType::SIZET, "SIZET" }, - { DataType::QSYMM8, "QSYMM8" }, - { DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL" }, - { DataType::QASYMM8, "QASYMM8" }, - { DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED" }, - { DataType::QSYMM16, "QSYMM16" }, - { DataType::QASYMM16, "QASYMM16" }, + static std::map<DataType, const std::string> dt_map = { + {DataType::UNKNOWN, "UNKNOWN"}, + {DataType::S8, "S8"}, + {DataType::U8, "U8"}, + {DataType::S16, "S16"}, + {DataType::U16, "U16"}, + {DataType::S32, "S32"}, + {DataType::U32, "U32"}, + {DataType::S64, "S64"}, + {DataType::U64, "U64"}, + {DataType::F16, "F16"}, + {DataType::F32, "F32"}, + {DataType::F64, "F64"}, + {DataType::SIZET, "SIZET"}, + {DataType::QSYMM8, "QSYMM8"}, + {DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL"}, + {DataType::QASYMM8, "QASYMM8"}, + {DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED"}, + {DataType::QSYMM16, "QSYMM16"}, + {DataType::QASYMM16, "QASYMM16"}, }; return dt_map[dt]; @@ -58,12 +57,11 @@ const std::string &string_from_data_type(DataType dt) DataType data_type_from_name(const 
std::string &name) { - static const std::map<std::string, DataType> data_types = - { - { "f16", DataType::F16 }, - { "f32", DataType::F32 }, - { "qasymm8", DataType::QASYMM8 }, - { "qasymm8_signed", DataType::QASYMM8_SIGNED }, + static const std::map<std::string, DataType> data_types = { + {"f16", DataType::F16}, + {"f32", DataType::F32}, + {"qasymm8", DataType::QASYMM8}, + {"qasymm8_signed", DataType::QASYMM8_SIGNED}, }; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED @@ -74,7 +72,7 @@ DataType data_type_from_name(const std::string &name) #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::out_of_range &) + catch (const std::out_of_range &) { ARM_COMPUTE_ERROR_VAR("Invalid data type name: %s", name.c_str()); } diff --git a/src/core/utils/FormatUtils.cpp b/src/core/utils/FormatUtils.cpp index 05b649ded2..46f8455315 100644 --- a/src/core/utils/FormatUtils.cpp +++ b/src/core/utils/FormatUtils.cpp @@ -30,26 +30,16 @@ namespace arm_compute { const std::string &string_from_format(Format format) { - static std::map<Format, const std::string> formats_map = - { - { Format::UNKNOWN, "UNKNOWN" }, - { Format::U8, "U8" }, - { Format::S16, "S16" }, - { Format::U16, "U16" }, - { Format::S32, "S32" }, - { Format::U32, "U32" }, - { Format::F16, "F16" }, - { Format::F32, "F32" }, - { Format::UV88, "UV88" }, - { Format::RGB888, "RGB888" }, - { Format::RGBA8888, "RGBA8888" }, - { Format::YUV444, "YUV444" }, - { Format::YUYV422, "YUYV422" }, - { Format::NV12, "NV12" }, - { Format::NV21, "NV21" }, - { Format::IYUV, "IYUV" }, - { Format::UYVY422, "UYVY422" } - }; + static std::map<Format, const std::string> formats_map = { + {Format::UNKNOWN, "UNKNOWN"}, {Format::U8, "U8"}, + {Format::S16, "S16"}, {Format::U16, "U16"}, + {Format::S32, "S32"}, {Format::U32, "U32"}, + {Format::F16, "F16"}, {Format::F32, "F32"}, + {Format::UV88, "UV88"}, {Format::RGB888, "RGB888"}, + {Format::RGBA8888, "RGBA8888"}, {Format::YUV444, "YUV444"}, + {Format::YUYV422, "YUYV422"}, {Format::NV12, "NV12"}, + {Format::NV21, "NV21"}, {Format::IYUV, "IYUV"}, + {Format::UYVY422, "UYVY422"}}; return formats_map[format]; } diff --git a/src/core/utils/InterpolationPolicyUtils.cpp b/src/core/utils/InterpolationPolicyUtils.cpp index 2d6cabe85e..276e760544 100644 --- a/src/core/utils/InterpolationPolicyUtils.cpp +++ b/src/core/utils/InterpolationPolicyUtils.cpp @@ -29,11 +29,10 @@ namespace arm_compute const std::string &string_from_interpolation_policy(InterpolationPolicy policy) { - static std::map<InterpolationPolicy, const std::string> interpolation_policy_map = - { - { InterpolationPolicy::AREA, "AREA" }, - { InterpolationPolicy::BILINEAR, "BILINEAR" }, - { InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR" }, + static std::map<InterpolationPolicy, const std::string> interpolation_policy_map = { + {InterpolationPolicy::AREA, "AREA"}, + {InterpolationPolicy::BILINEAR, "BILINEAR"}, + {InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR"}, }; return interpolation_policy_map[policy]; diff --git a/src/core/utils/ScaleUtils.cpp b/src/core/utils/ScaleUtils.cpp index ee57a8e7a7..a92da39b67 100644 --- a/src/core/utils/ScaleUtils.cpp +++ b/src/core/utils/ScaleUtils.cpp @@ -23,11 +23,12 @@ */ #include "src/core/utils/ScaleUtils.h" -#include "src/common/cpuinfo/CpuIsaInfo.h" #include "arm_compute/core/CPP/CPPTypes.h" #include "arm_compute/core/TensorInfo.h" +#include "src/common/cpuinfo/CpuIsaInfo.h" + float arm_compute::scale_utils::calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners) { const size_t offset = 
(align_corners && output_size > 1) ? 1 : 0; @@ -40,13 +41,15 @@ float arm_compute::scale_utils::calculate_resize_ratio(size_t input_size, size_t return static_cast<float>(in) / static_cast<float>(out); } -bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout, DataType data_type, - InterpolationPolicy policy, BorderMode border_mode) +bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout, + DataType data_type, + InterpolationPolicy policy, + BorderMode border_mode) { // Do not calculate precomputed weights and indices if kernel code doesn't use them - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { - switch(data_type) + switch (data_type) { case DataType::F32: case DataType::F16: @@ -62,4 +65,4 @@ bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout } return true; -}
\ No newline at end of file +} diff --git a/src/core/utils/ScaleUtils.h b/src/core/utils/ScaleUtils.h index 1484824a7f..d8dddc8c70 100644 --- a/src/core/utils/ScaleUtils.h +++ b/src/core/utils/ScaleUtils.h @@ -60,8 +60,11 @@ inline bool is_align_corners_allowed_sampling_policy(SamplingPolicy sampling_pol * * @return True if precomputation is required */ -bool is_precomputation_required(DataLayout data_layout, DataType data_type, InterpolationPolicy policy, BorderMode border_mode); +bool is_precomputation_required(DataLayout data_layout, + DataType data_type, + InterpolationPolicy policy, + BorderMode border_mode); } // namespace scale_utils } // namespace arm_compute -#endif /* UTILS_CORE_SCALEUTILS_H */
\ No newline at end of file +#endif /* UTILS_CORE_SCALEUTILS_H */ diff --git a/src/core/utils/StringUtils.cpp b/src/core/utils/StringUtils.cpp index 6d05c9b64e..bcab0ce10c 100644 --- a/src/core/utils/StringUtils.cpp +++ b/src/core/utils/StringUtils.cpp @@ -55,7 +55,7 @@ std::string float_to_string_with_full_precision(float val) ss.precision(std::numeric_limits<float>::max_digits10); ss << val; - if(val != static_cast<int>(val)) + if (val != static_cast<int>(val)) { ss << "f"; } @@ -65,17 +65,11 @@ std::string float_to_string_with_full_precision(float val) std::string join(const std::vector<std::string> strings, const std::string &sep) { - if(strings.empty()) + if (strings.empty()) { return ""; } - return std::accumulate( - std::next(strings.begin()), - strings.end(), - strings.at(0), - [&sep](const std::string & a, const std::string & b) - { - return a + sep + b; - }); -} + return std::accumulate(std::next(strings.begin()), strings.end(), strings.at(0), + [&sep](const std::string &a, const std::string &b) { return a + sep + b; }); } +} // namespace arm_compute diff --git a/src/core/utils/helpers/fft.cpp b/src/core/utils/helpers/fft.cpp index 64633c643d..edc8d0eacc 100644 --- a/src/core/utils/helpers/fft.cpp +++ b/src/core/utils/helpers/fft.cpp @@ -37,7 +37,7 @@ std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsign unsigned int res = N; // Early exit if no supported factors are provided - if(supported_factors.empty()) + if (supported_factors.empty()) { return stages; } @@ -46,10 +46,10 @@ std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsign auto rfactor_it = supported_factors.rbegin(); // Decomposition step - while(res != 0) + while (res != 0) { const unsigned int factor = *rfactor_it; - if(0 == (res % factor) && res >= factor) + if (0 == (res % factor) && res >= factor) { stages.push_back(factor); res /= factor; @@ -57,9 +57,9 @@ std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsign else { ++rfactor_it; - if(rfactor_it == supported_factors.rend()) + if (rfactor_it == supported_factors.rend()) { - if(res > 1) + if (res > 1) { // Couldn't decompose with given factors stages.clear(); @@ -81,8 +81,9 @@ std::vector<unsigned int> digit_reverse_indices(unsigned int N, const std::vecto std::vector<unsigned int> idx_digit_reverse; // Early exit in case N and fft stages do not match - const float stages_prod = std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1, std::multiplies<unsigned int>()); - if(stages_prod != N) + const float stages_prod = + std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1, std::multiplies<unsigned int>()); + if (stages_prod != N) { return idx_digit_reverse; } @@ -94,13 +95,13 @@ std::vector<unsigned int> digit_reverse_indices(unsigned int N, const std::vecto unsigned int n_stages = fft_stages.size(); // Scan elements - for(unsigned int n = 0; n < N; ++n) + for (unsigned int n = 0; n < N; ++n) { unsigned int k = n; unsigned int Nx = fft_stages[0]; // Scan stages - for(unsigned int s = 1; s < n_stages; ++s) + for (unsigned int s = 1; s < n_stages; ++s) { // radix of stage i-th unsigned int Ny = fft_stages[s]; diff --git a/src/core/utils/helpers/float_ops.h b/src/core/utils/helpers/float_ops.h index 99e1ea54ee..7f7fbd13bf 100644 --- a/src/core/utils/helpers/float_ops.h +++ b/src/core/utils/helpers/float_ops.h @@ -39,8 +39,7 @@ union RawFloat * * @param[in] val Floating-point value */ - explicit RawFloat(float val) - : f32(val) + explicit RawFloat(float val) : f32(val) { } 
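
// A worked reading of decompose_stages above, assuming N = 12 and supported
// radices {2, 3, 4} (namespace and header path assumed from this file's
// location): the scan is greedy from the largest radix, so 12 = 4 * 3 yields
// stages {4, 3}.
#include <set>
#include <vector>

#include "src/core/utils/helpers/fft.h"

void example_fft_stages() // hypothetical
{
    const std::set<unsigned int>    radices{2U, 3U, 4U};
    const std::vector<unsigned int> stages = arm_compute::helpers::fft::decompose_stages(12U, radices);
    // stages == {4U, 3U}; an N with no decomposition over the given radices
    // would come back as an empty vector.
    (void)stages;
}
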
/** Extract sign of floating point number diff --git a/src/core/utils/helpers/tensor_info.h b/src/core/utils/helpers/tensor_info.h index 9279532e2a..fd4745a453 100644 --- a/src/core/utils/helpers/tensor_info.h +++ b/src/core/utils/helpers/tensor_info.h @@ -41,15 +41,17 @@ namespace tensor_info * @return True if tensors have mismatching quantization info else false. */ template <typename... Ts> -inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos) +inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1, + const ITensorInfo *tensor_info_2, + Ts... tensor_infos) { const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info(); - const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } }; - return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info) - { - return tensor_info->quantization_info() != first_quantization_info; - }); + const std::array<const ITensorInfo *, 1 + sizeof...(Ts)> tensor_infos_array{ + {tensor_info_2, std::forward<Ts>(tensor_infos)...}}; + return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), + [&](const ITensorInfo *tensor_info) + { return tensor_info->quantization_info() != first_quantization_info; }); } } // namespace tensor_info } // namespace helpers diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp index f2216995a9..19d0badd74 100644 --- a/src/core/utils/helpers/tensor_transform.cpp +++ b/src/core/utils/helpers/tensor_transform.cpp @@ -36,10 +36,11 @@ int calculate_stride_on_index(int index, Coordinates strides) return index >= static_cast<int>(strides.num_dimensions()) ? 1 : strides[index]; } -int calculate_start_on_index(TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask) +int calculate_start_on_index( + TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask) { // Early exit - if(index >= static_cast<int>(starts.num_dimensions())) + if (index >= static_cast<int>(starts.num_dimensions())) { return 0; } @@ -51,14 +52,14 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta int start = starts[index]; // Reset in case of begin mask present - if(arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index)) + if (arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index)) { start = stride > 0 ? 
std::numeric_limits<int>::lowest() : std::numeric_limits<int>::max(); } // Account negative start points const int dim_size = input_shape[index]; - if(start < 0) + if (start < 0) { start += dim_size; } @@ -69,12 +70,16 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta return start; } -int calculate_end_on_index(TensorShape input_shape, int index, int start_on_index, - Coordinates ends, Coordinates strides, - int32_t end_mask, int32_t shrink_axis_mask) +int calculate_end_on_index(TensorShape input_shape, + int index, + int start_on_index, + Coordinates ends, + Coordinates strides, + int32_t end_mask, + int32_t shrink_axis_mask) { // Early exit - if(index >= static_cast<int>(ends.num_dimensions())) + if (index >= static_cast<int>(ends.num_dimensions())) { return input_shape[index]; } @@ -86,9 +91,9 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde int stop = ends[index]; // Shrink dimension - if(shrink_axis) + if (shrink_axis) { - if(start_on_index == std::numeric_limits<int>::max()) + if (start_on_index == std::numeric_limits<int>::max()) { stop = start_on_index; } @@ -99,14 +104,14 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde } // Reset in case of begin mask present - if(arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis) + if (arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis) { stop = (stride > 0) ? std::numeric_limits<int>::max() : std::numeric_limits<int>::lowest(); } // Account negative end points const int dim_size = input_shape[index]; - if(stop < 0) + if (stop < 0) { stop += dim_size; } @@ -118,14 +123,18 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde } std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords(TensorShape input_shape, - Coordinates starts, Coordinates ends, Coordinates strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) + Coordinates starts, + Coordinates ends, + Coordinates strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { Coordinates starts_abs{}; Coordinates ends_abs{}; Coordinates final_strides{}; - for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i) + for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i) { const int start_i = calculate_start_on_index(input_shape, i, starts, strides, begin_mask); starts_abs.set(i, start_i); @@ -136,13 +145,19 @@ std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords return std::make_tuple(starts_abs, ends_abs, final_strides); } -TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends, Coordinates strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask, bool return_unshrinked) +TensorShape compute_strided_slice_output_shape(TensorShape input_shape, + Coordinates starts, + Coordinates ends, + Coordinates strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask, + bool return_unshrinked) { unsigned int index = 0; TensorShape output_shape; - for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i) + for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i) { const int stride = calculate_stride_on_index(index, strides); const int start = calculate_start_on_index(input_shape, i, starts, strides, begin_mask); @@ -150,11 +165,11 @@ TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordina const int 
range = end - start; const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i); - if(return_unshrinked || !is_shrink) + if (return_unshrinked || !is_shrink) { - if((range == 0) || // Zero range - (range < 0 && stride >= 0) || // Negative range with positive stride - (range > 0 && stride <= 0)) // Positive range with negative stride + if ((range == 0) || // Zero range + (range < 0 && stride >= 0) || // Negative range with positive stride + (range > 0 && stride <= 0)) // Positive range with negative stride { output_shape.set(index, 0); return output_shape; @@ -173,9 +188,9 @@ int32_t construct_slice_end_mask(Coordinates ends) { // Create end mask int32_t end_mask = 0; - for(unsigned int i = 0; i < ends.num_dimensions(); ++i) + for (unsigned int i = 0; i < ends.num_dimensions(); ++i) { - if(ends[i] < 0) + if (ends[i] < 0) { end_mask |= 1 << i; } diff --git a/src/core/utils/io/FileHandler.cpp b/src/core/utils/io/FileHandler.cpp index 95fc2e3fa2..d106493238 100644 --- a/src/core/utils/io/FileHandler.cpp +++ b/src/core/utils/io/FileHandler.cpp @@ -21,16 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include <string> - #include "arm_compute/core/utils/io/FileHandler.h" #include "arm_compute/core/Error.h" +#include <string> + using namespace arm_compute::io; -FileHandler::FileHandler() - : _filestream(), _filename(" "), _mode() +FileHandler::FileHandler() : _filestream(), _filename(" "), _mode() { } diff --git a/src/core/utils/logging/FilePrinter.cpp b/src/core/utils/logging/FilePrinter.cpp index 55e78f9630..7b4eead38d 100644 --- a/src/core/utils/logging/FilePrinter.cpp +++ b/src/core/utils/logging/FilePrinter.cpp @@ -25,8 +25,7 @@ using namespace arm_compute::logging; -FilePrinter::FilePrinter(const std::string &filename) - : _handler() +FilePrinter::FilePrinter(const std::string &filename) : _handler() { _handler.open(filename, std::fstream::out | std::fstream::trunc); } @@ -34,4 +33,4 @@ FilePrinter::FilePrinter(const std::string &filename) void FilePrinter::print_internal(const std::string &msg) { _handler.stream() << msg << std::endl; -}
\ No newline at end of file +} diff --git a/src/core/utils/logging/Helpers.cpp b/src/core/utils/logging/Helpers.cpp index c3df7f6207..14ad910562 100644 --- a/src/core/utils/logging/Helpers.cpp +++ b/src/core/utils/logging/Helpers.cpp @@ -30,13 +30,12 @@ using namespace arm_compute::logging; const std::string &arm_compute::logging::string_from_log_level(LogLevel log_level) { - static std::map<LogLevel, const std::string> log_level_map = - { - { LogLevel::VERBOSE, "VERBOSE" }, - { LogLevel::INFO, "INFO" }, - { LogLevel::WARN, "WARN" }, - { LogLevel::OFF, "OFF" }, + static std::map<LogLevel, const std::string> log_level_map = { + {LogLevel::VERBOSE, "VERBOSE"}, + {LogLevel::INFO, "INFO"}, + {LogLevel::WARN, "WARN"}, + {LogLevel::OFF, "OFF"}, }; return log_level_map[log_level]; -}
\ No newline at end of file +} diff --git a/src/core/utils/logging/Logger.cpp b/src/core/utils/logging/Logger.cpp index 70b5868da8..d6681f8179 100644 --- a/src/core/utils/logging/Logger.cpp +++ b/src/core/utils/logging/Logger.cpp @@ -30,10 +30,7 @@ using namespace arm_compute::logging; Logger::Logger(std::string name, LogLevel log_level, std::shared_ptr<Printer> printer) - : _name(std::move(name)), _log_level(log_level), _printers( -{ - std::move(printer) -}), _decorators() + : _name(std::move(name)), _log_level(log_level), _printers({std::move(printer)}), _decorators() { // Check printer ARM_COMPUTE_ERROR_ON(printer == nullptr); @@ -46,7 +43,7 @@ Logger::Logger(std::string name, LogLevel log_level, std::vector<std::shared_ptr : _name(std::move(name)), _log_level(log_level), _printers(std::move(printers)), _decorators() { // Check printers - for(const auto &p : _printers) + for (const auto &p : _printers) { ARM_COMPUTE_UNUSED(p); ARM_COMPUTE_ERROR_ON(p == nullptr); @@ -62,13 +59,13 @@ Logger::Logger(std::string name, : _name(std::move(name)), _log_level(log_level), _printers(std::move(printers)), _decorators(std::move(decorators)) { // Check printers - for(const auto &p : _printers) + for (const auto &p : _printers) { ARM_COMPUTE_UNUSED(p); ARM_COMPUTE_ERROR_ON(p == nullptr); } // Check decorators - for(const auto &d : _decorators) + for (const auto &d : _decorators) { ARM_COMPUTE_UNUSED(d); ARM_COMPUTE_ERROR_ON(d == nullptr); @@ -79,7 +76,7 @@ void Logger::log(LogLevel log_level, const std::string &msg) { // Return if message shouldn't be logged // i.e. if log level does not match the logger's - if(!is_loggable(log_level)) + if (!is_loggable(log_level)) { return; } @@ -129,7 +126,7 @@ bool Logger::is_loggable(LogLevel log_level) void Logger::decorate_log_msg(LogMsg &msg) { - for(const auto &d : _decorators) + for (const auto &d : _decorators) { d->decorate(msg); } @@ -148,7 +145,7 @@ std::string Logger::create_log_msg(const std::string &str, LogLevel log_level) void Logger::print_all(const std::string &msg) { - for(auto &p : _printers) + for (auto &p : _printers) { p->print(msg); } diff --git a/src/core/utils/logging/LoggerRegistry.cpp b/src/core/utils/logging/LoggerRegistry.cpp index c281d8863c..17015d9ae9 100644 --- a/src/core/utils/logging/LoggerRegistry.cpp +++ b/src/core/utils/logging/LoggerRegistry.cpp @@ -24,15 +24,15 @@ #include "arm_compute/core/utils/logging/LoggerRegistry.h" #include "arm_compute/core/Error.h" + #include "support/Mutex.h" using namespace arm_compute::logging; /** Reserved logger used by the library */ -std::set<std::string> LoggerRegistry::_reserved_loggers = { "CORE", "RUNTIME", "GRAPH" }; +std::set<std::string> LoggerRegistry::_reserved_loggers = {"CORE", "RUNTIME", "GRAPH"}; -LoggerRegistry::LoggerRegistry() - : _mtx(), _loggers() +LoggerRegistry::LoggerRegistry() : _mtx(), _loggers() { } @@ -42,10 +42,12 @@ LoggerRegistry &LoggerRegistry::get() return _instance; } -void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers) +void LoggerRegistry::create_logger(const std::string &name, + LogLevel log_level, + const std::vector<std::shared_ptr<Printer>> &printers) { arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx); - if((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end())) + if ((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end())) { _loggers[name] = std::make_shared<Logger>(name, 
log_level, printers); } @@ -54,7 +56,7 @@ void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level, void LoggerRegistry::remove_logger(const std::string &name) { arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx); - if(_loggers.find(name) != _loggers.end()) + if (_loggers.find(name) != _loggers.end()) { _loggers.erase(name); } @@ -69,9 +71,9 @@ std::shared_ptr<Logger> LoggerRegistry::logger(const std::string &name) void LoggerRegistry::create_reserved_loggers(LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers) { arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx); - for(const auto &r : _reserved_loggers) + for (const auto &r : _reserved_loggers) { - if(_loggers.find(r) == _loggers.end()) + if (_loggers.find(r) == _loggers.end()) { _loggers[r] = std::make_shared<Logger>(r, log_level, printers); } diff --git a/src/core/utils/misc/MMappedFile.cpp b/src/core/utils/misc/MMappedFile.cpp index adae8a2bf0..a467cb3320 100644 --- a/src/core/utils/misc/MMappedFile.cpp +++ b/src/core/utils/misc/MMappedFile.cpp @@ -27,12 +27,11 @@ #include <cstdio> #include <cstring> -#include <tuple> - #include <fcntl.h> #include <sys/mman.h> #include <sys/stat.h> #include <sys/types.h> +#include <tuple> #include <unistd.h> namespace arm_compute @@ -53,7 +52,7 @@ std::pair<size_t, bool> get_file_size(const std::string &filename) { struct stat st; // NOLINT memset(&st, 0, sizeof(struct stat)); - if(stat(filename.c_str(), &st) == 0) + if (stat(filename.c_str(), &st) == 0) { return std::make_pair(st.st_size, true); } @@ -73,8 +72,7 @@ size_t get_page_size() } } // namespace -MMappedFile::MMappedFile() - : _filename(), _file_size(0), _map_size(0), _map_offset(0), _fp(nullptr), _data(nullptr) +MMappedFile::MMappedFile() : _filename(), _file_size(0), _map_size(0), _map_offset(0), _fp(nullptr), _data(nullptr) { } @@ -92,14 +90,14 @@ MMappedFile::~MMappedFile() bool MMappedFile::map(const std::string &filename, size_t size, size_t offset) { // Check if file is mapped - if(is_mapped()) + if (is_mapped()) { return false; } // Open file _fp = fopen(filename.c_str(), "a+be"); - if(_fp == nullptr) + if (_fp == nullptr) { return false; } @@ -107,26 +105,26 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset) // Extract file descriptor int fd = fileno(_fp); bool status = fd >= 0; - if(status) + if (status) { // Get file size std::tie(_file_size, status) = get_file_size(_filename); - if(status) + if (status) { // Map all file from offset if map size is 0 _map_size = (size == 0) ? 
_file_size : size; _map_offset = offset; // Check offset mapping - if((_map_offset > _file_size) || (_map_offset % get_page_size() != 0)) + if ((_map_offset > _file_size) || (_map_offset % get_page_size() != 0)) { status = false; } else { // Truncate to file size - if(_map_offset + _map_size > _file_size) + if (_map_offset + _map_size > _file_size) { _map_size = _file_size - _map_offset; } @@ -137,7 +135,7 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset) } } - if(!status) + if (!status) { fclose(_fp); } @@ -148,14 +146,14 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset) void MMappedFile::release() { // Unmap file - if(_data != nullptr) + if (_data != nullptr) { ::munmap(_data, _file_size); _data = nullptr; } // Close file - if(_fp != nullptr) + if (_fp != nullptr) { fclose(_fp); _fp = nullptr; diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp index 086d63b968..f66d3e7064 100644 --- a/src/core/utils/quantization/AsymmHelpers.cpp +++ b/src/core/utils/quantization/AsymmHelpers.cpp @@ -22,8 +22,10 @@ * SOFTWARE. */ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/utils/quantization/AsymmHelpers.h" #include "support/ToolchainSupport.h" @@ -40,7 +42,7 @@ constexpr float epsilon = 0.00001f; Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon) { - if(multiplier >= 1.f) + if (multiplier >= 1.f) { Status status = calculate_quantized_multiplier_greater_than_one(multiplier, quant_multiplier, shift); *shift *= -1; @@ -69,13 +71,13 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier, *right_shift = -1 * shift_exp; auto q_fixed = static_cast<int64_t>(support::cpp11::round(q * fixed_point_one_Q0)); ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0); - if(q_fixed == fixed_point_one_Q0) + if (q_fixed == fixed_point_one_Q0) { q_fixed /= 2; --*right_shift; } - if(ignore_epsilon && *right_shift > 31) + if (ignore_epsilon && *right_shift > 31) { *right_shift = 0; q_fixed = 0; @@ -88,9 +90,8 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier, return Status{}; } -Status calculate_quantized_multiplier_greater_than_one(float multiplier, - int32_t *quantized_multiplier, - int32_t *left_shift) +Status +calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift) { ARM_COMPUTE_RETURN_ERROR_ON(quantized_multiplier == nullptr); ARM_COMPUTE_RETURN_ERROR_ON(left_shift == nullptr); @@ -101,7 +102,7 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier, *left_shift = shift_exp; auto q_fixed = static_cast<int64_t>(support::cpp11::round(q * fixed_point_one_Q0)); ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0); - if(q_fixed == fixed_point_one_Q0) + if (q_fixed == fixed_point_one_Q0) { q_fixed /= 2; ++*left_shift; @@ -113,9 +114,9 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier, return Status{}; } -arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_info, - const QuantizationInfo &wq_info, - const QuantizationInfo &oq_info, +arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_info, + const QuantizationInfo &wq_info, + const QuantizationInfo &oq_info, GEMMLowpOutputStageInfo &stage_info) { 
ARM_COMPUTE_RETURN_ERROR_ON(iq_info.scale().empty()); @@ -133,7 +134,7 @@ arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_i const float i_scale = iq_info.scale().at(0); const float o_scale = oq_info.scale().at(0); - for(unsigned int i = 0; i < size; ++i) + for (unsigned int i = 0; i < size; ++i) { const float multiplier = i_scale * w_scales[i] / o_scale; int32_t quant_multiplier = 0; @@ -154,7 +155,7 @@ std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_ty { int min_quant_val = 0; int max_quant_val = 0; - switch(data_type) + switch (data_type) { case DataType::QASYMM8: min_quant_val = std::numeric_limits<uint8_t>::min(); @@ -179,7 +180,9 @@ std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_ty return std::make_pair(min_quant_val, max_quant_val); } -std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type) +std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, + const ActivationLayerInfo &act_info, + DataType data_type) { ARM_COMPUTE_ERROR_ON(data_type != DataType::QASYMM8 && data_type != DataType::QASYMM8_SIGNED); @@ -190,20 +193,23 @@ std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const Quant const UniformQuantizationInfo q_unif = q_info.uniform(); - if(act_info.enabled()) + if (act_info.enabled()) { - switch(act_info.activation()) + switch (act_info.activation()) { case ActivationLayerInfo::ActivationFunction::RELU: type_min = q_unif.offset; break; case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: type_min = q_unif.offset; - type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info) : quantize_qasymm8_signed(act_info.a(), q_info); + type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info) + : quantize_qasymm8_signed(act_info.a(), q_info); break; case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - type_min = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.b(), q_info) : quantize_qasymm8_signed(act_info.b(), q_info); - type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info) : quantize_qasymm8_signed(act_info.a(), q_info); + type_min = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.b(), q_info) + : quantize_qasymm8_signed(act_info.b(), q_info); + type_max = (data_type == DataType::QASYMM8) ? 
quantize_qasymm8(act_info.a(), q_info) + : quantize_qasymm8_signed(act_info.a(), q_info); break; default: ARM_COMPUTE_ERROR("Activation function not supported."); @@ -226,7 +232,7 @@ void compute_quantized_multipliers_and_shifts(const ITensorInfo *input, const unsigned int num_filters = wq_info.scale().size(); - for(unsigned int i = 0; i < num_filters; ++i) + for (unsigned int i = 0; i < num_filters; ++i) { int32_t output_multiplier = 0; int32_t output_shift = 0; @@ -267,11 +273,11 @@ int32_t multiply_by_quantized_multiplier(int32_t input, int32_t qmul, int32_t sh int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v) { - if(exponent == 0) + if (exponent == 0) { return v; } - else if(exponent < 0) + else if (exponent < 0) { return rounding_divide_by_pow2(v, -exponent); } @@ -291,11 +297,14 @@ int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v) } } -void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, int32_t &output_inv_sqrt, int32_t &output_shift) +void get_invsqrt_quantized_multiplier_exp(int32_t input, + int32_t reverse_shift, + int32_t &output_inv_sqrt, + int32_t &output_shift) { ARM_COMPUTE_ERROR_ON(input < 0); - if(input <= 1) + if (input <= 1) { // dealing the inputs (0 and 1) separately to avoid overflow output_inv_sqrt = std::numeric_limits<std::int32_t>::max(); @@ -305,7 +314,7 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, // prepare input for fixed point operation and compute shift value output_shift = 11; - while(input >= (1 << 29)) + while (input >= (1 << 29)) { input /= 4; ++output_shift; @@ -334,9 +343,7 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, // multiplication of two fixed point numbers, defined for readability auto fixed_point_mul = [](FixedPointRawType a, FixedPointRawType b) -> FixedPointRawType - { - return saturating_rounding_doubling_highmul(a, b); - }; + { return saturating_rounding_doubling_highmul(a, b); }; // rescaling of fixed point to have dst_bit integer bits, defined for readability auto fixed_point_rescale = [](FixedPointRawType a, uint32_t src_bit, uint32_t dst_bit) -> FixedPointRawType @@ -347,17 +354,18 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, // 5 iterations of Newton-Raphson method for inverse square root - 1.5 * x_n = input/2 * (x_n)^3 constexpr int32_t num_iteration = 5; - for(int32_t i = 0; i < num_iteration; ++i) + for (int32_t i = 0; i < num_iteration; ++i) { const auto x3 = fixed_point_rescale(fixed_point_mul(fixed_point_mul(x, x), x), 9, fixedpoint_position); - x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3), 6, fixedpoint_position); + x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3), + 6, fixedpoint_position); } // fixed point representation of sqrt(1/2) const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250; x = fixed_point_mul(fixedpoint_half_sqrt_2, x); output_inv_sqrt = x; - if(output_shift < 0) + if (output_shift < 0) { output_inv_sqrt <<= -output_shift; output_shift = 0; @@ -365,5 +373,5 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, // convert right shift to left shift output_shift *= reverse_shift; } -} // quantization -} // arm_compute +} // namespace quantization +} // namespace arm_compute diff --git a/src/core/utils/quantization/AsymmHelpers.h b/src/core/utils/quantization/AsymmHelpers.h 
index f9701095cb..5dc607ce58 100644 --- a/src/core/utils/quantization/AsymmHelpers.h +++ b/src/core/utils/quantization/AsymmHelpers.h @@ -29,7 +29,8 @@ namespace arm_compute { -namespace quantization { +namespace quantization +{ /** Get minimum and maximum output of the activation function after quantization. * @@ -41,7 +42,9 @@ * * @return The minimum and maximum output of the activation function after quantization. */ -std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type); +std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, + const ActivationLayerInfo &act_info, + DataType data_type); } // namespace quantization } // namespace arm_compute
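
The hunks above only move braces and reflow arguments, but a few of the routines they touch read better outside a flattened diff, so the notes below restate them as small self-contained C++ sketches. All helper names in these sketches are invented and details are simplified; none of this code is part of the commit.

decompose_stages() in src/core/utils/helpers/fft.cpp greedily factorises the FFT length N into the supported radices, trying the largest radix first and falling back to smaller ones; an empty result means N cannot be built from the given factors. A minimal sketch of that greedy loop, assuming the same inputs:

    #include <set>
    #include <vector>

    // Hypothetical sketch of the greedy radix decomposition in decompose_stages().
    std::vector<unsigned int> decompose_sketch(unsigned int N, const std::set<unsigned int> &supported_factors)
    {
        std::vector<unsigned int> stages;
        unsigned int res = N;
        auto it = supported_factors.rbegin(); // largest supported radix first
        while (res > 1)
        {
            if (it == supported_factors.rend())
            {
                return {}; // residual cannot be decomposed with the given radices
            }
            if (res % *it == 0)
            {
                stages.push_back(*it); // keep this radix and divide it out
                res /= *it;
            }
            else
            {
                ++it; // fall back to the next smaller radix
            }
        }
        return stages; // e.g. N = 96 with factors {2, 3, 4, 5, 7, 8} yields {8, 4, 3}
    }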
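
The tensor_transform.cpp hunks implement TensorFlow-style strided-slice coordinates: a set begin/end mask bit replaces the coordinate with the extreme value for the stride direction, and negative coordinates wrap around the dimension size before being clamped. A reduced sketch of the start-coordinate normalisation under those assumptions (the function name is invented, and the clamping in the real calculate_start_on_index() is slightly more involved):

    #include <algorithm>
    #include <limits>

    // Sketch of strided-slice start normalisation: mask bit -> extreme value,
    // negative start -> wrap around dim_size, then clamp into the dimension.
    int normalise_start_sketch(int start, int dim_size, int stride, bool begin_mask_bit)
    {
        if (begin_mask_bit)
        {
            start = (stride > 0) ? std::numeric_limits<int>::lowest() : std::numeric_limits<int>::max();
        }
        if (start < 0)
        {
            start += dim_size; // e.g. start = -1 on a dimension of 8 becomes 7
        }
        return std::min(std::max(start, 0), dim_size - 1);
    }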
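
MMappedFile::map() in src/core/utils/misc/MMappedFile.cpp follows the standard POSIX mapping protocol: query the file size with stat(), reject offsets that are not multiples of the page size (mmap() requires page-aligned offsets), keep the window within the file, then map it. A minimal sketch with plain POSIX calls, assuming a read/write shared mapping and rejecting (rather than truncating, as the original does) windows that run past the end of the file:

    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    // Hypothetical single-shot version of the open/validate/mmap flow.
    void *map_file_sketch(const char *path, size_t size, off_t offset)
    {
        const long page = sysconf(_SC_PAGESIZE);
        if (page <= 0 || offset % page != 0)
        {
            return nullptr; // mmap() offsets must be page aligned
        }
        const int fd = open(path, O_RDWR);
        if (fd < 0)
        {
            return nullptr;
        }
        struct stat st = {};
        if (fstat(fd, &st) != 0 || offset + static_cast<off_t>(size) > st.st_size)
        {
            close(fd);
            return nullptr; // window runs past the end of the file
        }
        void *data = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, offset);
        close(fd); // the mapping stays valid after the descriptor is closed
        return (data == MAP_FAILED) ? nullptr : data;
    }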
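
The AsymmHelpers.cpp routines store a real-valued requantisation factor in the gemmlowp representation: multiplier ~= quant_multiplier * 2^(-31 - right_shift), with quant_multiplier held in [2^30, 2^31). The encoding side for multipliers below one reduces to std::frexp() plus a rounding fix-up; a compact sketch (invented name, without the epsilon handling and Status plumbing of calculate_quantized_multiplier_less_than_one()):

    #include <cmath>
    #include <cstdint>

    // Encode multiplier (0 < multiplier < 1) as a Q31 fixed-point value plus a right shift.
    void quantize_multiplier_sketch(float multiplier, int32_t &quant_multiplier, int32_t &right_shift)
    {
        int shift_exp = 0;
        const double q = std::frexp(multiplier, &shift_exp); // multiplier = q * 2^shift_exp, q in [0.5, 1)
        right_shift = -shift_exp;
        auto q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
        if (q_fixed == (1ll << 31)) // q rounded up to exactly 1.0: renormalise
        {
            q_fixed /= 2;
            --right_shift;
        }
        quant_multiplier = static_cast<int32_t>(q_fixed);
    }

At runtime the integer rescale is then a doubling high multiply by quant_multiplier followed by a rounding right shift, which is what multiply_by_quantized_multiplier() in the hunk above performs.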
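
Finally, get_invsqrt_quantized_multiplier_exp() computes 1/sqrt(input) without floating point: it repeatedly divides the operand by 4 until it sits in a range where Newton-Raphson converges (each factor of 4 removed doubles the inverse square root, which the tracked shift undoes at the end), then runs five iterations of x <- x * (1.5 - 0.5 * a * x^2). A float-domain sketch of the same scheme with an invented name (the real code stays in 32-bit fixed point throughout and also handles the reverse_shift direction):

    #include <cstdint>

    // Returns x such that 1/sqrt(input) == x / 2^shift, mimicking the fixed-point scheme.
    float invsqrt_sketch(uint32_t input, int32_t &shift)
    {
        shift = 0;
        if (input <= 1)
        {
            return 1.0f; // the original special-cases 0 and 1 to avoid overflow
        }
        float a = static_cast<float>(input);
        while (a >= 1.0f) // normalise into [0.25, 1): 1/sqrt(4a) = (1/sqrt(a)) / 2
        {
            a *= 0.25f;
            ++shift;
        }
        float x = 1.0f; // safe initial guess on [0.25, 1)
        for (int i = 0; i < 5; ++i) // five Newton-Raphson refinements, as in the original
        {
            x = x * (1.5f - 0.5f * a * x * x);
        }
        return x;
    }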