about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/core/CL/ICLKernel.cpp | 2
-rw-r--r-- src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp | 21
-rw-r--r-- src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp | 33
-rw-r--r-- src/core/CL/kernels/CLIm2ColKernel.cpp | 13
-rw-r--r-- src/core/CL/kernels/CLPoolingLayerKernel.cpp | 28
-rw-r--r-- src/core/CL/kernels/CLScaleKernel.cpp | 16
-rw-r--r-- src/core/CL/kernels/CLUpsampleLayerKernel.cpp | 12
-rw-r--r-- src/core/CL/kernels/CLWinogradInputTransformKernel.cpp | 32
-rw-r--r-- src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp | 7
-rw-r--r-- src/core/NEON/kernels/NEIm2ColKernel.cpp | 19
-rw-r--r-- src/core/NEON/kernels/NEPoolingLayerKernel.cpp | 22
-rw-r--r-- src/core/NEON/kernels/NEReductionOperationKernel.cpp | 38
-rw-r--r-- src/core/NEON/kernels/NEScaleKernel.cpp | 4
-rw-r--r-- src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp | 10
-rw-r--r-- src/runtime/CL/functions/CLReduceMean.cpp | 120
-rw-r--r-- src/runtime/NEON/functions/NEReduceMean.cpp | 97
16 files changed, 241 insertions, 233 deletions
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 2d28a496c9..d81ad46b29 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -98,7 +98,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons
for(unsigned int n = 0; n < info->num_dimensions(); ++n)
{
- offset_first_element += window[n].start() * strides[n];
+ offset_first_element += (window.is_broadcasted(n) ? 0 : window[n].start()) * strides[n];
}
unsigned int idx_start = idx;
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index 295fb5c997..177f05f3ca 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,7 +35,7 @@
using namespace arm_compute;
CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel()
- : _input(nullptr), _output(nullptr), _info()
+ : _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN)
{
}
@@ -72,13 +72,14 @@ void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTe
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- _input = input;
- _output = output;
- _info = info;
-
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info));
+ _input = input;
+ _output = output;
+ _info = info;
+ _data_layout = input->info()->data_layout();
+
// Create kernel
CLBuildOptions build_opts;
build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
@@ -99,10 +100,8 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const DataLayout data_layout = _input->info()->data_layout();
-
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
const int out_start_x = _info.pad().first;
const int out_end_x = _output->info()->dimension(idx_w) - _info.pad().first + _info.stride().first - 1;
@@ -112,7 +111,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
const int out_end_y = _output->info()->dimension(idx_h) - _info.pad().second + _info.stride().second - 1;
const int out_step_y = _info.stride().second;
- switch(data_layout)
+ switch(_data_layout)
{
case DataLayout::NCHW:
{
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index dc4c431c5d..21685dcf0e 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -377,7 +377,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
} // namespace
CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
- : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0)
+ : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _border_size(0), _conv_stride_x(0), _conv_stride_y(0)
{
}
@@ -390,10 +390,10 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- const DataLayout data_layout = input->info()->data_layout();
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ _data_layout = input->info()->data_layout();
+ const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
const unsigned int kernel_size = weights->info()->dimension(width_idx);
const DataType data_type = input->info()->data_type();
@@ -419,11 +419,11 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
_conv_stride_x = std::get<0>(conv_info.stride());
_conv_stride_y = std::get<1>(conv_info.stride());
- if(data_layout == DataLayout::NHWC)
+ if(_data_layout == DataLayout::NHWC)
{
_border_size = BorderSize(conv_info.pad_left(), 0, conv_info.pad_right(), 0);
}
- else if(data_layout == DataLayout::NCHW)
+ else if(_data_layout == DataLayout::NCHW)
{
_border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
}
@@ -441,15 +441,15 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
std::stringstream kernel_name;
kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
- if(data_layout == DataLayout::NHWC)
+ if(_data_layout == DataLayout::NHWC)
{
- kernel_name << "_" << lower_string(string_from_data_layout(data_layout));
+ kernel_name << "_" << lower_string(string_from_data_layout(_data_layout));
}
CLBuildOptions build_options;
build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));
- const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, data_layout);
+ const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);
if(run_optimized_for_bifrost)
{
@@ -466,9 +466,9 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
- if(data_layout == DataLayout::NHWC)
+ if(_data_layout == DataLayout::NHWC)
{
- const bool run_optimized_for_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, data_layout);
+ const bool run_optimized_for_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);
build_options.add_option(std::string("-DDATA_LAYOUT_NHWC=1"));
build_options.add_option(std::string("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(height_idx))));
build_options.add_option(std::string("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(width_idx))));
@@ -538,7 +538,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(height_idx));
_config_id += "_";
- _config_id += lower_string(string_from_data_layout(data_layout));
+ _config_id += lower_string(string_from_data_layout(_data_layout));
}
Status CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -562,9 +562,8 @@ void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue
win_in.adjust(Window::DimX, -_border_size.left, true);
win_in.adjust(Window::DimY, -_border_size.top, true);
- const DataLayout data_layout = _input->info()->data_layout();
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
win_in.set_dimension_step(width_idx, window[width_idx].step() * _conv_stride_x);
win_in.set_dimension_step(height_idx, window[height_idx].step() * _conv_stride_y);
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 10d6e68cd9..24f22c31a5 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -287,7 +287,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size
} // namespace
CLIm2ColKernel::CLIm2ColKernel()
- : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
+ : _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
{
}
@@ -297,9 +297,10 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
- const DataLayout data_layout = input->info()->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ _data_layout = input->info()->data_layout();
+
+ const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
const unsigned int input_width = input->info()->dimension(width_idx);
const unsigned int input_height = input->info()->dimension(height_idx);
@@ -336,7 +337,7 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(1));
_config_id += "_";
- _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
+ _config_id += lower_string(string_from_data_layout(_data_layout));
}
Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
@@ -369,7 +370,7 @@ void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
Window slice_in = first_slice_3d;
Window slice_out = window_output.first_slice_window_2D();
- if(_input->info()->data_layout() == DataLayout::NHWC)
+ if(_data_layout == DataLayout::NHWC)
{
const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3);
const int num_batches = tmp_win[3].end();
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 8eaf5bf76f..032d451aad 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -172,7 +172,7 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso
} // namespace
CLPoolingLayerKernel::CLPoolingLayerKernel()
- : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1)
+ : _input(nullptr), _output(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1)
{
}
@@ -185,13 +185,18 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ // Set instance variables
+ _input = input;
+ _output = output;
+ _pool_info = pool_info;
+ _data_layout = input->info()->data_layout();
+
int pool_stride_x = 0;
int pool_stride_y = 0;
const PoolingType pool_type = pool_info.pool_type();
- DataLayout data_layout = input->info()->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
const int pool_size_x = pool_info.is_global_pooling() ? input->info()->dimension(idx_width) : pool_info.pool_size().width;
const int pool_size_y = pool_info.is_global_pooling() ? input->info()->dimension(idx_height) : pool_info.pool_size().height;
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
@@ -218,11 +223,6 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
auto_init(input->info(), output->info(), pool_info);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info));
- // Set instance variables
- _input = input;
- _output = output;
- _pool_info = pool_info;
-
const DataType data_type = input->info()->data_type();
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
@@ -237,7 +237,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
build_opts.add_option_if(data_type == DataType::F16, "-DFP16");
// Create kernel
- switch(data_layout)
+ switch(_data_layout)
{
case DataLayout::NCHW:
{
@@ -286,7 +286,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
ICLKernel::configure_internal(std::get<1>(win_config));
- if(data_layout == DataLayout::NCHW)
+ if(_data_layout == DataLayout::NCHW)
{
CLPoolingConfig pooling_config = std::get<2>(win_config);
_num_elems_processed_per_iteration = pooling_config.first;
@@ -302,7 +302,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
_config_id = "pooling_layer_";
_config_id += lower_string(string_from_data_type(data_type));
_config_id += "_";
- _config_id += lower_string(string_from_data_layout(data_layout));
+ _config_id += lower_string(string_from_data_layout(_data_layout));
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(idx_width));
_config_id += "_";
@@ -333,7 +333,7 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
// Collapse window
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- switch(_input->info()->data_layout())
+ switch(_data_layout)
{
case DataLayout::NCHW:
{
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index 488313fd12..82c5c8a446 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -160,11 +160,12 @@ const ICLTensor *CLScaleKernel::output() const
void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy)
{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy));
+
_input = input;
_output = output;
_interpolationPolicy = policy;
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy));
+ _data_layout = input->info()->data_layout();
float wr = 0.f;
float hr = 0.f;
@@ -172,10 +173,9 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, Interpo
const bool call_quantized_kernel = is_data_type_quantized_asymmetric(input->info()->data_type()) && policy == InterpolationPolicy::BILINEAR;
- DataLayout data_layout = input->info()->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const bool is_nhwc = data_layout == DataLayout::NHWC;
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const bool is_nhwc = _data_layout == DataLayout::NHWC;
// Compute the ratio between source width/height and destination width/height
const unsigned int input_width = input->info()->dimension(idx_width);
@@ -215,7 +215,7 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, Interpo
std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
std::string kernel_name = "scale_" + interpolation_name;
kernel_name += call_quantized_kernel ? "_quantized_" : "_";
- kernel_name += lower_string(string_from_data_layout(data_layout));
+ kernel_name += lower_string(string_from_data_layout(_data_layout));
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
unsigned int idx = is_nhwc ? 2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
@@ -249,7 +249,7 @@ void CLScaleKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- switch(_input->info()->data_layout())
+ switch(_data_layout)
{
case DataLayout::NCHW:
{
diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
index ce5ed86332..2ccd540788 100644
--- a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
@@ -37,7 +37,7 @@
namespace arm_compute
{
CLUpsampleLayerKernel::CLUpsampleLayerKernel()
- : _input(nullptr), _output(nullptr), _info(), _num_elems_processed_per_iteration_input_x()
+ : _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration_input_x()
{
}
@@ -71,13 +71,12 @@ void CLUpsampleLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
_input = input;
_output = output;
_info = info;
+ _data_layout = input->info()->data_layout();
_num_elems_processed_per_iteration_input_x = 1;
- const DataLayout data_layout = input->info()->data_layout();
-
TensorShape output_shape = misc::shape_calculator::compute_upsample_shape(*input->info(), info);
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
- output->info()->set_data_layout(data_layout);
+ output->info()->set_data_layout(_data_layout);
unsigned int num_elems_processed_per_iteration_x = 16;
const int output_width_x = output->info()->dimension(0);
@@ -88,7 +87,7 @@ void CLUpsampleLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
Window win{};
- switch(data_layout)
+ switch(_data_layout)
{
case DataLayout::NCHW:
{
@@ -140,8 +139,7 @@ void CLUpsampleLayerKernel::run(const Window &window, cl::CommandQueue &queue)
Window slice_out = collapsed_window.first_slice_window_3D();
Window slice_in = collapsed_window.first_slice_window_3D();
- DataLayout data_layout = _input->info()->data_layout();
- switch(data_layout)
+ switch(_data_layout)
{
case DataLayout::NCHW:
slice_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration_input_x));
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index 1c31ceba99..6125790491 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -99,7 +99,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
} // namespace
CLWinogradInputTransformKernel::CLWinogradInputTransformKernel()
- : _border_size(0), _input(nullptr), _output(nullptr), _num_tiles_x(0), _num_tiles_y(0), _step_z(1)
+ : _border_size(0), _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _num_tiles_x(0), _num_tiles_y(0), _step_z(1)
{
}
@@ -116,16 +116,17 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
const PadStrideInfo conv_info = winograd_info.convolution_info;
const Size2D output_tile_size = winograd_info.output_tile_size;
const Size2D kernel_size = winograd_info.kernel_size;
- const DataLayout data_layout = input->info()->data_layout();
- const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ _data_layout = input->info()->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
// Compute number of elements to process in the X and Y direction
const int num_elements_x = input->info()->dimension(idx_w) - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
const int num_elements_y = input->info()->dimension(idx_h) - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
- if(data_layout == DataLayout::NCHW)
+ if(_data_layout == DataLayout::NCHW)
{
// Check if we need to extend the right or bottom border
const unsigned int extra_border_right = ((num_elements_x % output_tile_size.width) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.width - 1);
@@ -166,7 +167,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
- if(data_layout == DataLayout::NHWC)
+ if(_data_layout == DataLayout::NHWC)
{
build_opts.add_option_if(total_batches > 1, "-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y));
build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
@@ -184,7 +185,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height);
// Check optimized kernel if output_dims == 2x2
- if((tile_max_dim == 2) && (data_layout == DataLayout::NCHW))
+ if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW))
{
_step_z = (_input->info()->dimension(2) % 2) != 0 ? 1 : 2;
}
@@ -192,7 +193,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
// Append stepz and data layout
kernel_name += "_stepz";
kernel_name += support::cpp11::to_string(_step_z);
- kernel_name += "_" + lower_string(string_from_data_layout(data_layout));
+ kernel_name += "_" + lower_string(string_from_data_layout(_data_layout));
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
@@ -212,7 +213,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
_config_id += "_";
_config_id += support::cpp11::to_string(conv_info.pad_top());
_config_id += "_";
- _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
+ _config_id += lower_string(string_from_data_layout(_data_layout));
}
Status CLWinogradInputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
@@ -229,11 +230,10 @@ void CLWinogradInputTransformKernel::run(const Window &window, cl::CommandQueue
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- const DataLayout data_layout = _input->info()->data_layout();
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const size_t total_batches = window.shape().total_size_upper(3);
+ const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+ const size_t total_batches = window.shape().total_size_upper(3);
// Collapse window
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
@@ -241,7 +241,7 @@ void CLWinogradInputTransformKernel::run(const Window &window, cl::CommandQueue
Window slice = window_collapsed.first_slice_window_3D();
slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1));
slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1));
- if(data_layout == DataLayout::NHWC)
+ if(_data_layout == DataLayout::NHWC)
{
slice.set(idx_h, Window::Dimension(0, _num_tiles_y * total_batches, 1));
}
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index df631c3c03..98b0c106db 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -63,7 +63,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
} // namespace
NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape()
+ : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN)
{
}
@@ -80,6 +80,7 @@ void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output,
_input = input;
_output = output;
_block_shape = block_shape;
+ _data_layout = input->info()->data_layout();
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
@@ -99,7 +100,7 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- const int idx_channel = get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
const int depth_size = _input->info()->dimension(idx_channel);
const int r = (depth_size / (_block_shape * _block_shape));
const int element_size = _input->info()->element_size();
@@ -112,7 +113,7 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
// Main loop for NCHW and NHWC
- if(_input->info()->data_layout() == DataLayout::NCHW)
+ if(_data_layout == DataLayout::NCHW)
{
Window slice_in = window.first_slice_window_2D();
do
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 0641d6cfa3..27c3d66b4f 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -265,10 +265,9 @@ void NEIm2ColKernel::run_im2col(const Window &window)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const DataLayout data_layout = _input->info()->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
const int input_w = _input->info()->dimension(width_idx);
const int input_h = _input->info()->dimension(height_idx);
@@ -344,7 +343,7 @@ void NEIm2ColKernel::run_im2col(const Window &window)
}
NEIm2ColKernel::NEIm2ColKernel()
- : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U)
+ : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U), _data_layout(DataLayout::UNKNOWN)
{
}
@@ -355,9 +354,9 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
ARM_COMPUTE_UNUSED(num_groups);
- const DataLayout data_layout = input->info()->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ _data_layout = input->info()->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
_input = input;
_output = output;
@@ -370,7 +369,7 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
_conv_info, _dilation);
_has_bias = has_bias;
- if(data_layout == DataLayout::NCHW)
+ if(_data_layout == DataLayout::NCHW)
{
switch(_input->info()->data_type())
{
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 62c9ca0d5e..14de4a19d8 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -321,7 +321,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
} // namespace
NEPoolingLayerKernel::NEPoolingLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
+ : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
{
}
@@ -364,14 +364,15 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h));
// Set instance variables
- _input = input;
- _output = output;
- _pool_info = pool_info;
- _is_square = (pool_size.x() == pool_size.y());
+ _input = input;
+ _output = output;
+ _pool_info = pool_info;
+ _data_layout = input->info()->data_layout();
+ _is_square = (pool_size.x() == pool_size.y());
// Get data type
const DataType data_type = input->info()->data_type();
- const bool is_nchw = data_layout == DataLayout::NCHW;
+ const bool is_nchw = _data_layout == DataLayout::NCHW;
if(data_type == DataType::QASYMM8)
{
@@ -1574,7 +1575,12 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const
// Calculate square-root in case of l2 pooling
if(pooling_type == PoolingType::L2)
{
- vres = vmulq_f32(vres, vinvsqrtq_f32(vres));
+ float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
+ static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
+ static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
+ static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
+ };
+ vres = l2_res;
}
// Store result
@@ -1835,7 +1841,7 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
const bool exclude_padding = _pool_info.exclude_padding();
Window window_input(window);
- if(_input->info()->data_layout() == DataLayout::NCHW)
+ if(_data_layout == DataLayout::NCHW)
{
// Set step for input in x and y direction for the input
unsigned int window_x_inc = 0;
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index ffa4fa3565..16cd6f77b4 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -742,23 +742,8 @@ struct RedOpYZW
for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
{
- T *in_ptr;
- switch(axis)
- {
- case 1:
- in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim)));
- break;
- case 2:
- in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim)));
- break;
- case 3:
- in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim)));
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
+ const T *in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.strides_in_bytes()[axis] * dim);
const auto vec_elements = wrapper::vloadq(in_ptr);
-
switch(op)
{
case ReductionOperation::SUM:
@@ -907,23 +892,8 @@ struct RedOpYZW_qasymm8
for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
{
- uint8_t *in_ptr;
- switch(axis)
- {
- case 1:
- in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, index_dim));
- break;
- case 2:
- in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, index_dim));
- break;
- case 3:
- in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, index_dim));
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- const auto vec_elements = wrapper::vloadq(in_ptr);
-
+ const uint8_t *in_ptr = input.ptr() + in_info.strides_in_bytes()[axis] * index_dim;
+ const auto vec_elements = wrapper::vloadq(in_ptr);
switch(op)
{
case ReductionOperation::SUM:
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index a2a44fca18..5b8e196a2c 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -909,7 +909,7 @@ void NEScaleKernel::scale_area_nchw(const Window &window)
void NEScaleKernel::scale_nhwc(const Window &window)
{
// Get data layout and width/height indices
- const DataLayout data_layout = _input->info()->data_layout();
+ const DataLayout data_layout = DataLayout::NHWC;
const int idx_channels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
index 4803365013..ffd2dc14bf 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
@@ -66,7 +66,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
} // namespace
NESpaceToDepthLayerKernel::NESpaceToDepthLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape()
+ : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN)
{
}
@@ -82,6 +82,7 @@ void NESpaceToDepthLayerKernel::configure(const ITensor *input, ITensor *output,
_input = input;
_block_shape = block_shape;
_output = output;
+ _data_layout = input->info()->data_layout();
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -100,9 +101,8 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- const DataLayout data_layout = _input->info()->data_layout();
- const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int element_size = _input->info()->element_size();
+ const int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+ const int element_size = _input->info()->element_size();
const size_t channel_size = _input->info()->dimension(channel_idx);
@@ -111,7 +111,7 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info
int batch_id = 0;
// Main loop for NCHW and NHWC
- if(_output->info()->data_layout() == DataLayout::NCHW)
+ if(_data_layout == DataLayout::NCHW)
{
do
{
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index a3634cd46e..c5de43da35 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -26,20 +26,81 @@
#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "support/ToolchainSupport.h"
namespace arm_compute
{
+namespace
+{
+/** Check that the arguments of a reduce-mean operation are consistent.
+ *
+ * @param[in] input          Info of the tensor to reduce. Must be non-null, single channel and QASYMM8/F16/F32.
+ * @param[in] reduction_axis Axes to reduce. Must hold between 1 and rank(input) entries, each in [-rank(input), rank(input)).
+ * @param[in] keep_dims      If true, each reduced dimension is expected to stay in the output with size 1;
+ *                           otherwise it is expected to be removed from the output shape.
+ * @param[in] output         Info of the destination tensor. Must be non-null. Its shape is only checked when it is
+ *                           already initialised (total_size() != 0).
+ *
+ * @return An empty Status when every check passes.
+ *         NOTE(review): the ARM_COMPUTE_RETURN_ERROR_* macros are assumed to return an error Status on failure - confirm in arm_compute/core/Error.h.
+ */
+Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(keep_dims);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ const int input_dims = input->num_dimensions();
+ Coordinates axis_local = reduction_axis;
+
+ for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
+ {
+ // axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
+ }
+
+ if(output->tensor_shape().total_size() != 0)
+ {
+ // The output shape is only validated when the output is already initialised, i.e. when auto_init is not used
+ TensorShape out_shape = input->tensor_shape();
+ // Sort the axes so that the remove_index arithmetic below sees them in ascending order.
+ // NOTE(review): convert_negative_axis presumably maps negative axes to their positive equivalents - confirm against its definition.
+ convert_negative_axis(axis_local, input_dims);
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for(unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
+ if(output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ }
+ if(keep_dims)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ else
+ {
+ // i dimensions have already been removed from out_shape, so the i-th sorted axis sits i positions lower
+ ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i]));
+ const unsigned int remove_index = axis_local[i] - i;
+ ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions());
+ out_shape.remove_dimension(remove_index);
+ }
+ }
+ // Compare the caller-provided output shape against the expected reduced shape
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+ }
+ return Status{};
+}
+} // namespace
CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
{
}
void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
+ // Output auto initialization if not yet initialized
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims);
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
_reduction_ops = reduction_axis.num_dimensions();
_reduction_kernels.resize(_reduction_ops);
@@ -49,14 +110,10 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
Coordinates axis_local = reduction_axis;
const int input_dims = input->info()->num_dimensions();
- // Convert negative axis
- for(unsigned int i = 0; i < _reduction_ops; ++i)
- {
- axis_local[i] = wrap_around(axis_local[i], input_dims);
- }
+ convert_negative_axis(axis_local, input_dims);
// Perform reduction for every axis
- for(unsigned int i = 0; i < _reduction_ops; ++i)
+ for(int i = 0; i < _reduction_ops; ++i)
{
TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
@@ -75,7 +132,7 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
}
// Allocate intermediate tensors
- for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
{
_reduced_outs[i].allocator()->allocate();
}
@@ -88,7 +145,7 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
- for(unsigned int i = 0; i < _reduction_ops; ++i)
+ for(int i = 0; i < _reduction_ops; ++i)
{
out_shape.remove_dimension(axis_local[i] - i);
}
@@ -99,55 +156,16 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
-
- TensorShape out_shape = input->tensor_shape();
-
- Coordinates axis_sorted = reduction_axis;
- const unsigned int reduction_ops = reduction_axis.num_dimensions();
- const int input_dims = input->num_dimensions();
-
- // Convert negative axis
- for(unsigned int i = 0; i < reduction_ops; ++i)
- {
- axis_sorted[i] = wrap_around(axis_sorted[i], input_dims);
- }
-
- std::sort(axis_sorted.begin(), axis_sorted.begin() + reduction_ops);
- for(unsigned int i = 0; i < reduction_ops; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(axis_sorted[i] > 3);
- ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_sorted[i]) > input->num_dimensions() - 1);
- if(output->total_size() > 0 && keep_dims)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_sorted[i]) != 1);
- }
- if(keep_dims)
- {
- out_shape.set(axis_sorted[i], 1);
- }
- else
- {
- out_shape.remove_dimension(axis_sorted[i] - i);
- }
- }
-
- const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-
- return Status{};
+ // All argument and shape checks are delegated to the validate_config helper
+ return validate_config(input, reduction_axis, keep_dims, output);
}
void CLReduceMean::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
- for(unsigned int i = 0; i < _reduction_ops; ++i)
+ for(auto &kernel : _reduction_kernels)
{
- _reduction_kernels[i].run();
+ kernel.run();
}
if(!_keep_dims)
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 0b145f034d..96ec8b8587 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -24,80 +24,97 @@
#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-using namespace arm_compute;
+namespace arm_compute
+{
NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
{
}
-Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+namespace
+{
+/** Check that the arguments of a reduce-mean operation are consistent.
+ *
+ * Kept in an anonymous namespace so the helper has internal linkage and does not
+ * add an undeclared symbol to the arm_compute namespace.
+ *
+ * @param[in] input          Info of the tensor to reduce. Must be non-null, single channel and QASYMM8/F16/F32.
+ * @param[in] reduction_axis Axes to reduce. Must hold between 1 and rank(input) entries, each in [-rank(input), rank(input)).
+ * @param[in] keep_dims      If true, each reduced dimension is expected to stay in the output with size 1;
+ *                           otherwise it is expected to be removed from the output shape.
+ * @param[in] output         Info of the destination tensor. Must be non-null. Its shape is only checked when it is
+ *                           already initialised (total_size() != 0).
+ *
+ * @return An empty Status when every check passes.
+ *         NOTE(review): the ARM_COMPUTE_RETURN_ERROR_* macros are assumed to return an error Status on failure - confirm in arm_compute/core/Error.h.
+ */
+Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
ARM_COMPUTE_UNUSED(keep_dims);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
- TensorShape out_shape = input->tensor_shape();
const unsigned int reduction_ops = reduction_axis.num_dimensions();
const int input_dims = input->num_dimensions();
Coordinates axis_local = reduction_axis;
- // Convert negative axis
- for(unsigned int i = 0; i < reduction_ops; ++i)
+ for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
{
- axis_local[i] = wrap_around(axis_local[i], input_dims);
+ // axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
}
- std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
- for(unsigned int i = 0; i < reduction_ops; ++i)
+ if(output->tensor_shape().total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
- ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
- if(output->total_size() > 0 && keep_dims)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
- }
- if(keep_dims)
- {
- out_shape.set(axis_local[i], 1);
- }
- else
+ // The output shape is only validated when the output is already initialised, i.e. when auto_init is not used
+ TensorShape out_shape = input->tensor_shape();
+ // Sort the axes so that the remove_index arithmetic below sees them in ascending order.
+ // NOTE(review): convert_negative_axis presumably maps negative axes to their positive equivalents - confirm against its definition.
+ convert_negative_axis(axis_local, input_dims);
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for(unsigned int i = 0; i < reduction_ops; ++i)
{
- out_shape.remove_dimension(axis_local[i] - i);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
+ if(output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ }
+ if(keep_dims)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ else
+ {
+ // i dimensions have already been removed from out_shape, so the i-th sorted axis sits i positions lower
+ ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i]));
+ const unsigned int remove_index = axis_local[i] - i;
+ ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions());
+ out_shape.remove_dimension(remove_index);
+ }
}
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
}
- const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-
return Status{};
}
+} // namespace
+Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+{
+ // All argument and shape checks are delegated to the validate_config helper defined above
+ return validate_config(input, reduction_axis, keep_dims, output);
+}
+
void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
+ // Output auto initialization if not yet initialized
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims);
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
_reduction_ops = reduction_axis.num_dimensions();
_reduction_kernels.resize(_reduction_ops);
_reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
_keep_dims = keep_dims;
- Coordinates axis_local = reduction_axis;
- const int input_dims = input->info()->num_dimensions();
- const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
- // Convert negative axis
- for(unsigned int i = 0; i < reduction_ops; ++i)
- {
- axis_local[i] = wrap_around(axis_local[i], input_dims);
- }
+ convert_negative_axis(axis_local, input_dims);
// Perform reduction for every axis
- for(unsigned int i = 0; i < _reduction_ops; ++i)
+ for(int i = 0; i < _reduction_ops; ++i)
{
TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
@@ -116,7 +133,7 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
}
// Allocate intermediate tensors
- for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
{
_reduced_outs[i].allocator()->allocate();
}
@@ -125,11 +142,10 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
if(!keep_dims)
{
TensorShape out_shape = input->info()->tensor_shape();
-
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
- for(unsigned int i = 0; i < _reduction_ops; ++i)
+ for(int i = 0; i < _reduction_ops; ++i)
{
out_shape.remove_dimension(axis_local[i] - i);
}
@@ -141,10 +157,9 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
void NEReduceMean::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
-
- for(unsigned int i = 0; i < _reduction_ops; ++i)
+ for(auto &kernel : _reduction_kernels)
{
- _reduction_kernels[i].run();
+ kernel.run();
}
if(!_keep_dims)
@@ -152,3 +167,5 @@ void NEReduceMean::run()
_reshape.run();
}
}
+
+} // namespace arm_compute