diff options
43 files changed, 414 insertions, 266 deletions
@@ -1,6 +1,6 @@ MIT License -Copyright (c) 2017-2019 ARM Software +Copyright (c) 2017-2020 ARM Software Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -31,6 +31,7 @@ Blogs: Documentation available here: +- [v19.08.1](https://arm-software.github.io/ComputeLibrary/v19.08.1/) - [v19.08](https://arm-software.github.io/ComputeLibrary/v19.08/) - [v19.05](https://arm-software.github.io/ComputeLibrary/v19.05/) - [v19.02](https://arm-software.github.io/ComputeLibrary/v19.02/) @@ -50,6 +51,8 @@ Documentation available here: Binaries available here: +- [v19.08.1-linux](https://github.com/ARM-software/ComputeLibrary/releases/download/v19.08.1/arm_compute-v19.08.1-bin-linux.tar.gz) +- [v19.08.1-android](https://github.com/ARM-software/ComputeLibrary/releases/download/v19.08.1/arm_compute-v19.08.1-bin-android.tar.gz) - [v19.08-linux](https://github.com/ARM-software/ComputeLibrary/releases/download/v19.08/arm_compute-v19.08-bin-linux.tar.gz) - [v19.08-android](https://github.com/ARM-software/ComputeLibrary/releases/download/v19.08/arm_compute-v19.08-bin-android.tar.gz) - [v19.05-linux](https://github.com/ARM-software/ComputeLibrary/releases/download/v19.05/arm_compute-v19.05-bin-linux.tar.gz) diff --git a/SConscript b/SConscript index 6c9b0bb796..e06262ec72 100644 --- a/SConscript +++ b/SConscript @@ -24,8 +24,8 @@ import os.path import re import subprocess -VERSION = "v19.08" -SONAME_VERSION="16.0.0" +VERSION = "v19.08.1" +SONAME_VERSION="16.1.0" Import('env') Import('vars') diff --git a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h index 21d026e0a1..2dd20e9588 100644 --- a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h +++ b/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. 
+ * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -72,6 +72,7 @@ private: const ICLTensor *_input; ICLTensor *_output; PadStrideInfo _info; + DataLayout _data_layout; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h index 081b01aad3..faf97e45dc 100644 --- a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -94,6 +94,7 @@ public: const ICLTensor *_biases; const ICLTensor *_weights; ICLTensor *_output; + DataLayout _data_layout; BorderSize _border_size; int _conv_stride_x; int _conv_stride_y; diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h index 0647f5dcec..00cb416e90 100644 --- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h +++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h @@ -103,6 +103,7 @@ public: public: const ICLTensor *_input; ICLTensor *_output; + DataLayout _data_layout; std::pair<unsigned int, unsigned int> _convolved_dims; unsigned int _num_elems_processed_per_iteration; Size2D _kernel_dims; diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h index db1a756229..68a99039d8 100644 --- a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h @@ -75,6 +75,7 @@ public: const ICLTensor *_input; ICLTensor *_output; PoolingLayerInfo _pool_info; + DataLayout _data_layout; BorderSize _border_size; unsigned int _num_elems_processed_per_iteration; }; diff --git a/arm_compute/core/CL/kernels/CLScaleKernel.h b/arm_compute/core/CL/kernels/CLScaleKernel.h index 
ff72af29fc..1ada3cde85 100644 --- a/arm_compute/core/CL/kernels/CLScaleKernel.h +++ b/arm_compute/core/CL/kernels/CLScaleKernel.h @@ -75,6 +75,7 @@ public: public: InterpolationPolicy _interpolationPolicy = InterpolationPolicy::BILINEAR; + DataLayout _data_layout = DataLayout::UNKNOWN; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_CLSCALEKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h b/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h index dcd4f1bdb4..c8c69002c4 100644 --- a/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h @@ -73,6 +73,7 @@ private: const ICLTensor *_input; ICLTensor *_output; Size2D _info; + DataLayout _data_layout; unsigned int _num_elems_processed_per_iteration_input_x; }; } // namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h index bc05a0ebf1..30bd3abb43 100644 --- a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h +++ b/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h @@ -93,6 +93,7 @@ private: BorderSize _border_size; const ICLTensor *_input; ICLTensor *_output; + DataLayout _data_layout; int _num_tiles_x; int _num_tiles_y; unsigned int _step_z; diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h index 87b1fdf64c..8d526e96c0 100644 --- a/arm_compute/core/Helpers.h +++ b/arm_compute/core/Helpers.h @@ -766,6 +766,20 @@ inline T wrap_around(T x, T m) return x >= 0 ? x % m : (x % m + m) % m; } +/** Convert negative coordinates to positive in the range [0, num_dims_input] + * + * @param[out] coords Array of coordinates to be converted. 
+ * @param[in] max_value Maximum value to be used when wrapping the negative values in coords + */ +inline Coordinates &convert_negative_axis(Coordinates &coords, int max_value) +{ + for(unsigned int i = 0; i < coords.num_dimensions(); ++i) + { + coords[i] = wrap_around(coords[i], max_value); + } + return coords; +} + /** Given an integer value, this function returns the next power of two * * @param[in] x Input value diff --git a/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h index a62da049a5..b34f6d3ebf 100644 --- a/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h @@ -74,6 +74,7 @@ private: const ITensor *_input; /**< Source tensor */ ITensor *_output; /**< Destination tensor */ int32_t _block_shape; /**< Block shape */ + DataLayout _data_layout; /**< Data layout of the operation */ }; } // namespace arm_compute #endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h index f76521f770..689da857a7 100644 --- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h +++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -131,6 +131,7 @@ private: unsigned int _kernel_height; bool _has_bias; Size2D _dilation; + DataLayout _data_layout; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_NEIM2COLKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h index 5f45a90cef..5b143250e9 100644 --- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h @@ -194,6 +194,7 @@ private: const ITensor *_input; ITensor *_output; PoolingLayerInfo _pool_info; + DataLayout _data_layout; unsigned int _num_elems_processed_per_iteration; BorderSize _border_size; bool _is_square; diff --git a/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h index c9ecdd26f8..68bc1737c8 100644 --- a/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h @@ -75,6 +75,7 @@ private: const ITensor *_input; /**< Source tensor */ ITensor *_output; /**< Destination tensor */ int32_t _block_shape; /**< Block shape */ + DataLayout _data_layout; /**< Data layout of the operation */ }; } // namespace arm_compute #endif /* __ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H__ */ diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h index a56227996b..be42fe9a87 100644 --- a/arm_compute/core/Window.h +++ b/arm_compute/core/Window.h @@ -48,7 +48,7 @@ public: /** Default constructor: create a window containing a single element. 
*/ constexpr Window() - : _dims() + : _dims(), _is_broadcasted(utility::generate_array<bool, Coordinates::num_max_dimensions, false>::value) { } /** Copy constructor @@ -170,6 +170,20 @@ public: */ void set(size_t dimension, const Dimension &dim); + /** Set the dimension as broadcasted dimension + * + * @param[in] dimension The dimension to set + */ + void set_broadcasted(size_t dimension); + + /** Return whether a dimension has been broadcasted + * + * @param[in] dimension The requested dimension + * + * @return true if the dimension has been broadcasted + */ + bool is_broadcasted(size_t dimension) const; + /** Use the tensor's dimensions to fill the window dimensions. * * @param[in] shape @ref TensorShape to copy the dimensions from. @@ -419,6 +433,7 @@ private: private: std::array<Dimension, Coordinates::num_max_dimensions> _dims; + std::array<bool, Coordinates::num_max_dimensions> _is_broadcasted; }; } // namespace arm_compute #include "Window.inl" diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl index eeef3df7b0..589d6bfafc 100644 --- a/arm_compute/core/Window.inl +++ b/arm_compute/core/Window.inl @@ -24,11 +24,12 @@ namespace arm_compute { inline Window::Window(const Window &src) - : _dims() + : _dims(), _is_broadcasted(utility::generate_array<bool, Coordinates::num_max_dimensions, false>::value) { for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i) { set(i, src[i]); + _is_broadcasted[i] = src.is_broadcasted(i); } } @@ -51,6 +52,19 @@ inline void Window::set(size_t dimension, const Window::Dimension &dim) _dims[dimension] = dim; } +inline void Window::set_broadcasted(size_t dimension) +{ + ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions); + set(dimension, Dimension(0, 0, 0)); + _is_broadcasted[dimension] = true; +} + +inline bool Window::is_broadcasted(size_t dimension) const +{ + ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions); + return _is_broadcasted[dimension]; +} + inline Window 
Window::collapse_if_possible(const Window &full_window, const size_t first, const size_t last, bool *has_collapsed) const { @@ -110,7 +124,7 @@ inline Window Window::broadcast_if_dimension_le_one(const TensorShape &shape) co { if(shape[d] <= 1) { - broadcastWin.set(d, Dimension(0, 0, 0)); + broadcastWin.set_broadcasted(d); } } return broadcastWin; diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index 65a2a1edf4..698a2b7a45 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -39,6 +39,42 @@ namespace misc { namespace shape_calculator { +/** Calculate the output tensor shape for the reduce mean operation + * + * @param[in] input Input tensor shape + * @param[in] reduction_axis Reduction axis + * @param[in] keep_dims Flag to indicate if dimensions are kept + * + * @return the calculated shape + */ +inline TensorShape calculate_reduce_mean_shape(ITensor *input, const Coordinates &reduction_axis, bool keep_dims) +{ + const int reduction_ops = reduction_axis.num_dimensions(); + Coordinates axis_local = reduction_axis; + const int input_dims = input->info()->num_dimensions(); + convert_negative_axis(axis_local, input_dims); + TensorShape out_shape = input->info()->tensor_shape(); + // Configure reshape layer if we want to drop the dimensions + if(!keep_dims) + { + // We have to sort the reduction axis vectors in order for remove_dimension + // to work properly + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for(int i = 0; i < reduction_ops; ++i) + { + out_shape.remove_dimension(axis_local[i] - i); + } + return out_shape; + } + else + { + for(int i = 0; i < reduction_ops; ++i) + { + out_shape.set(axis_local[i], 1); + } + return out_shape; + } +} /** Calculate the output tensor shape of a vector input given the convolution dimensions * * @param[in] input Input tensor shape diff --git a/arm_compute/core/utils/misc/Utility.h 
b/arm_compute/core/utils/misc/Utility.h index 8dd9afd5cd..2325644e72 100644 --- a/arm_compute/core/utils/misc/Utility.h +++ b/arm_compute/core/utils/misc/Utility.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -53,6 +53,20 @@ struct index_sequence_generator<0u, S...> : index_sequence<S...> template <std::size_t N> using index_sequence_t = typename index_sequence_generator<N>::type; + +template <typename T, std::size_t N, T val, T... vals> +struct generate_array : generate_array < T, N - 1, val, val, vals... > +{ +}; + +template <typename T, T val, T... vals> +struct generate_array<T, 0, val, vals...> +{ + static constexpr std::array<T, sizeof...(vals)> value{ vals... }; +}; + +template <typename T, T val, T... vals> +constexpr std::array<T, sizeof...(vals)> generate_array<T, 0, val, vals...>::value; /** @endcond */ namespace detail diff --git a/arm_compute/runtime/CL/functions/CLReduceMean.h b/arm_compute/runtime/CL/functions/CLReduceMean.h index 9c087eadf1..6836ba3f58 100644 --- a/arm_compute/runtime/CL/functions/CLReduceMean.h +++ b/arm_compute/runtime/CL/functions/CLReduceMean.h @@ -71,7 +71,7 @@ private: std::vector<CLReductionOperation> _reduction_kernels; std::vector<CLTensor> _reduced_outs; CLReshapeLayer _reshape; - unsigned int _reduction_ops; + int _reduction_ops; bool _keep_dims; }; } // namespace arm_compute diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h index fdd8edfe87..245f7577ce 100644 --- a/arm_compute/runtime/NEON/functions/NEReduceMean.h +++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h @@ -72,7 +72,7 @@ private: std::vector<NEReductionOperation> _reduction_kernels; std::vector<Tensor> _reduced_outs; NEReshapeLayer _reshape; - unsigned int _reduction_ops; + int _reduction_ops; bool _keep_dims; }; } // namespace arm_compute diff --git a/docs/00_introduction.dox 
b/docs/00_introduction.dox index 9c8eaf2733..bcbc818e59 100644 --- a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -236,6 +236,9 @@ If there is more than one release in a month then an extra sequential number is @subsection S2_2_changelog Changelog +v19.08.1 Public maintenance release + - Various bug fixes. + v19.08 Public major release - Various bug fixes. - Various optimisations. @@ -248,9 +251,14 @@ v19.08 Public major release - CLGEMMTranspose1xWKernel / CLGEMMTranspose1xW - CLWidthConcatenateLayer - New NEON kernels / functions: + - @ref NEAbsLayer - @ref NECast + - @ref NEElementwisePower + - @ref NELogLayer - @ref NELSTMLayerQuantized + - @ref NENegLayer - @ref NEPReluLayer + - @ref NESinLayer - @ref NEBatchConcatenateLayerKernel - @ref NEDepthToSpaceLayerKernel / @ref NEDepthToSpaceLayer - @ref NEDepthwiseConvolutionLayerNativeKernel @@ -258,8 +266,13 @@ v19.08 Public major release - @ref NEMeanStdDevNormalizationKernel / @ref NEMeanStdDevNormalizationLayer - @ref NESpaceToDepthLayerKernel / @ref NESpaceToDepthLayer - New OpenCL kernels / functions: + - @ref CLAbsLayer + - @ref CLElementwisePower + - @ref CLLogLayer - @ref CLLSTMLayerQuantized + - @ref CLNegLayer - @ref CLPReluLayer + - @ref CLSinLayer - @ref CLBatchConcatenateLayerKernel - @ref CLDepthToSpaceLayerKernel / @ref CLDepthToSpaceLayer - @ref CLGEMMLowpMatrixMultiplyNativeKernel @@ -271,6 +284,9 @@ v19.08 Public major release - neon_opticalflow - cl_cache - neon_permute + - Added support for FP16 in @ref NEDeconvolutionLayer + - Added support for FP16 in @ref CLDeconvolutionLayer + - Added support for REDUCE_MIN and REDUCE_MAX in @ref ReductionOperation - Enable the fusion of batch normalization with convolution and depthwise convolution layer for FP32 in the graph API (OpenCL only) - Added support for fusing activation function and broadcast addition with the matrix multiplication for FP32 (OpenCL only) - Re-factored the depthwise convolution layer kernel on NEON for generic cases
@@ -280,6 +296,7 @@ v19.08 Public major release - The @ref NEDepthwiseConvolutionLayer3x3 will be replaced by @ref NEDepthwiseConvolutionLayerOptimized to accommodate for future optimizations. - Removed inner_border_right and inner_border_top parameters from @ref CLDeconvolutionLayer interface - Removed inner_border_right and inner_border_top parameters from @ref NEDeconvolutionLayer interface + - Optimized the NEON assembly kernel for GEMMLowp. The new implementation fuses the output stage and quantization with the matrix multiplication kernel v19.05 Public major release - Various bug fixes. diff --git a/docs/Doxyfile b/docs/Doxyfile index e9027c85f1..5f3091c492 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "Compute Library" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 19.08 +PROJECT_NUMBER = 19.08.1 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/release_repository b/release_repository -Subproject 4ba87dbdc3b22220eba4a792c1f5c87e7a88c7a +Subproject 975dfe175e3d7c62c27598b1c0e8e77ed90df46 diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index 2d28a496c9..d81ad46b29 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -98,7 +98,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons for(unsigned int n = 0; n < info->num_dimensions(); ++n) { - offset_first_element += window[n].start() * strides[n]; + offset_first_element += (window.is_broadcasted(n) ? 
0 : window[n].start()) * strides[n]; } unsigned int idx_start = idx; diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp index 295fb5c997..177f05f3ca 100644 --- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp +++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -35,7 +35,7 @@ using namespace arm_compute; CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel() - : _input(nullptr), _output(nullptr), _info() + : _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN) { } @@ -72,13 +72,14 @@ void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTe { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - _input = input; - _output = output; - _info = info; - // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info)); + _input = input; + _output = output; + _info = info; + _data_layout = input->info()->data_layout(); + // Create kernel CLBuildOptions build_opts; build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); @@ -99,10 +100,8 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const DataLayout data_layout = _input->info()->data_layout(); - - const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(_data_layout, 
DataLayoutDimension::HEIGHT); const int out_start_x = _info.pad().first; const int out_end_x = _output->info()->dimension(idx_w) - _info.pad().first + _info.stride().first - 1; @@ -112,7 +111,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu const int out_end_y = _output->info()->dimension(idx_h) - _info.pad().second + _info.stride().second - 1; const int out_step_y = _info.stride().second; - switch(data_layout) + switch(_data_layout) { case DataLayout::NCHW: { diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp index dc4c431c5d..21685dcf0e 100644 --- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp +++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -377,7 +377,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } // namespace CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel() - : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0) + : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _border_size(0), _conv_stride_x(0), _conv_stride_y(0) { } @@ -390,10 +390,10 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - const DataLayout data_layout = input->info()->data_layout(); - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + _data_layout = input->info()->data_layout(); + const int width_idx = 
get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); const unsigned int kernel_size = weights->info()->dimension(width_idx); const DataType data_type = input->info()->data_type(); @@ -419,11 +419,11 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL _conv_stride_x = std::get<0>(conv_info.stride()); _conv_stride_y = std::get<1>(conv_info.stride()); - if(data_layout == DataLayout::NHWC) + if(_data_layout == DataLayout::NHWC) { _border_size = BorderSize(conv_info.pad_left(), 0, conv_info.pad_right(), 0); } - else if(data_layout == DataLayout::NCHW) + else if(_data_layout == DataLayout::NCHW) { _border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()); } @@ -441,15 +441,15 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL std::stringstream kernel_name; kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size; - if(data_layout == DataLayout::NHWC) + if(_data_layout == DataLayout::NHWC) { - kernel_name << "_" << lower_string(string_from_data_layout(data_layout)); + kernel_name << "_" << lower_string(string_from_data_layout(_data_layout)); } CLBuildOptions build_options; build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS")); - const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, data_layout); + const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout); if(run_optimized_for_bifrost) { @@ -466,9 +466,9 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL 
build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx)))); build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x))); - if(data_layout == DataLayout::NHWC) + if(_data_layout == DataLayout::NHWC) { - const bool run_optimized_for_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, data_layout); + const bool run_optimized_for_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout); build_options.add_option(std::string("-DDATA_LAYOUT_NHWC=1")); build_options.add_option(std::string("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(height_idx)))); build_options.add_option(std::string("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(width_idx)))); @@ -538,7 +538,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL _config_id += "_"; _config_id += support::cpp11::to_string(output->info()->dimension(height_idx)); _config_id += "_"; - _config_id += lower_string(string_from_data_layout(data_layout)); + _config_id += lower_string(string_from_data_layout(_data_layout)); } Status CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, @@ -562,9 +562,8 @@ void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue win_in.adjust(Window::DimX, -_border_size.left, true); win_in.adjust(Window::DimY, -_border_size.top, true); - const DataLayout data_layout = _input->info()->data_layout(); - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int height_idx = 
get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); win_in.set_dimension_step(width_idx, window[width_idx].step() * _conv_stride_x); win_in.set_dimension_step(height_idx, window[height_idx].step() * _conv_stride_y); diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp index 10d6e68cd9..24f22c31a5 100644 --- a/src/core/CL/kernels/CLIm2ColKernel.cpp +++ b/src/core/CL/kernels/CLIm2ColKernel.cpp @@ -287,7 +287,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size } // namespace CLIm2ColKernel::CLIm2ColKernel() - : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups() + : _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups() { } @@ -297,9 +297,10 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups)); - const DataLayout data_layout = input->info()->data_layout(); - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + _data_layout = input->info()->data_layout(); + + const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); const unsigned int input_width = 
input->info()->dimension(width_idx); const unsigned int input_height = input->info()->dimension(height_idx); @@ -336,7 +337,7 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const _config_id += "_"; _config_id += support::cpp11::to_string(output->info()->dimension(1)); _config_id += "_"; - _config_id += lower_string(string_from_data_layout(input->info()->data_layout())); + _config_id += lower_string(string_from_data_layout(_data_layout)); } Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, @@ -369,7 +370,7 @@ void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue) Window slice_in = first_slice_3d; Window slice_out = window_output.first_slice_window_2D(); - if(_input->info()->data_layout() == DataLayout::NHWC) + if(_data_layout == DataLayout::NHWC) { const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3); const int num_batches = tmp_win[3].end(); diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp index 8eaf5bf76f..032d451aad 100644 --- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp @@ -172,7 +172,7 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso } // namespace CLPoolingLayerKernel::CLPoolingLayerKernel() - : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1) + : _input(nullptr), _output(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1) { } @@ -185,13 +185,18 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + // Set instance variables + _input = input; + _output = output; + _pool_info = pool_info; + _data_layout = 
input->info()->data_layout(); + int pool_stride_x = 0; int pool_stride_y = 0; const PoolingType pool_type = pool_info.pool_type(); - DataLayout data_layout = input->info()->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); const int pool_size_x = pool_info.is_global_pooling() ? input->info()->dimension(idx_width) : pool_info.pool_size().width; const int pool_size_y = pool_info.is_global_pooling() ? input->info()->dimension(idx_height) : pool_info.pool_size().height; const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); @@ -218,11 +223,6 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, auto_init(input->info(), output->info(), pool_info); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info)); - // Set instance variables - _input = input; - _output = output; - _pool_info = pool_info; - const DataType data_type = input->info()->data_type(); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); @@ -237,7 +237,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, build_opts.add_option_if(data_type == DataType::F16, "-DFP16"); // Create kernel - switch(data_layout) + switch(_data_layout) { case DataLayout::NCHW: { @@ -286,7 +286,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); 
ICLKernel::configure_internal(std::get<1>(win_config)); - if(data_layout == DataLayout::NCHW) + if(_data_layout == DataLayout::NCHW) { CLPoolingConfig pooling_config = std::get<2>(win_config); _num_elems_processed_per_iteration = pooling_config.first; @@ -302,7 +302,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, _config_id = "pooling_layer_"; _config_id += lower_string(string_from_data_type(data_type)); _config_id += "_"; - _config_id += lower_string(string_from_data_layout(data_layout)); + _config_id += lower_string(string_from_data_layout(_data_layout)); _config_id += "_"; _config_id += support::cpp11::to_string(output->info()->dimension(idx_width)); _config_id += "_"; @@ -333,7 +333,7 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue) // Collapse window Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - switch(_input->info()->data_layout()) + switch(_data_layout) { case DataLayout::NCHW: { diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp index 488313fd12..82c5c8a446 100644 --- a/src/core/CL/kernels/CLScaleKernel.cpp +++ b/src/core/CL/kernels/CLScaleKernel.cpp @@ -160,11 +160,12 @@ const ICLTensor *CLScaleKernel::output() const void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy) { + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy)); + _input = input; _output = output; _interpolationPolicy = policy; - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy)); + _data_layout = input->info()->data_layout(); float wr = 0.f; float hr = 0.f; @@ -172,10 +173,9 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, Interpo const bool call_quantized_kernel = is_data_type_quantized_asymmetric(input->info()->data_type()) && policy == 
InterpolationPolicy::BILINEAR; - DataLayout data_layout = input->info()->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const bool is_nhwc = data_layout == DataLayout::NHWC; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const bool is_nhwc = _data_layout == DataLayout::NHWC; // Compute the ratio between source width/height and destination width/height const unsigned int input_width = input->info()->dimension(idx_width); @@ -215,7 +215,7 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, Interpo std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); std::string kernel_name = "scale_" + interpolation_name; kernel_name += call_quantized_kernel ? "_quantized_" : "_"; - kernel_name += lower_string(string_from_data_layout(data_layout)); + kernel_name += lower_string(string_from_data_layout(_data_layout)); _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); unsigned int idx = is_nhwc ? 
2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters @@ -249,7 +249,7 @@ void CLScaleKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - switch(_input->info()->data_layout()) + switch(_data_layout) { case DataLayout::NCHW: { diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp index ce5ed86332..2ccd540788 100644 --- a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp +++ b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp @@ -37,7 +37,7 @@ namespace arm_compute { CLUpsampleLayerKernel::CLUpsampleLayerKernel() - : _input(nullptr), _output(nullptr), _info(), _num_elems_processed_per_iteration_input_x() + : _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration_input_x() { } @@ -71,13 +71,12 @@ void CLUpsampleLayerKernel::configure(const ICLTensor *input, ICLTensor *output, _input = input; _output = output; _info = info; + _data_layout = input->info()->data_layout(); _num_elems_processed_per_iteration_input_x = 1; - const DataLayout data_layout = input->info()->data_layout(); - TensorShape output_shape = misc::shape_calculator::compute_upsample_shape(*input->info(), info); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); - output->info()->set_data_layout(data_layout); + output->info()->set_data_layout(_data_layout); unsigned int num_elems_processed_per_iteration_x = 16; const int output_width_x = output->info()->dimension(0); @@ -88,7 +87,7 @@ void CLUpsampleLayerKernel::configure(const ICLTensor *input, ICLTensor *output, Window win{}; - switch(data_layout) + switch(_data_layout) { case DataLayout::NCHW: { @@ -140,8 +139,7 @@ void CLUpsampleLayerKernel::run(const Window &window, cl::CommandQueue &queue) Window slice_out = 
collapsed_window.first_slice_window_3D(); Window slice_in = collapsed_window.first_slice_window_3D(); - DataLayout data_layout = _input->info()->data_layout(); - switch(data_layout) + switch(_data_layout) { case DataLayout::NCHW: slice_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration_input_x)); diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp index 1c31ceba99..6125790491 100644 --- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp +++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -99,7 +99,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } // namespace CLWinogradInputTransformKernel::CLWinogradInputTransformKernel() - : _border_size(0), _input(nullptr), _output(nullptr), _num_tiles_x(0), _num_tiles_y(0), _step_z(1) + : _border_size(0), _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _num_tiles_x(0), _num_tiles_y(0), _step_z(1) { } @@ -116,16 +116,17 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; const Size2D kernel_size = winograd_info.kernel_size; - const DataLayout data_layout = input->info()->data_layout(); - const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); + _data_layout = input->info()->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(_data_layout, 
DataLayoutDimension::HEIGHT); // Compute number of elements to process in the X and Y direction const int num_elements_x = input->info()->dimension(idx_w) - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right(); const int num_elements_y = input->info()->dimension(idx_h) - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom(); - if(data_layout == DataLayout::NCHW) + if(_data_layout == DataLayout::NCHW) { // Check if we need to extend the right or bottom border const unsigned int extra_border_right = ((num_elements_x % output_tile_size.width) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.width - 1); @@ -166,7 +167,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL"); build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL"); - if(data_layout == DataLayout::NHWC) + if(_data_layout == DataLayout::NHWC) { build_opts.add_option_if(total_batches > 1, "-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y)); build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1))); @@ -184,7 +185,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height); // Check optimized kernel if output_dims == 2x2 - if((tile_max_dim == 2) && (data_layout == DataLayout::NCHW)) + if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW)) { _step_z = (_input->info()->dimension(2) % 2) != 0 ? 
1 : 2; } @@ -192,7 +193,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor // Append stepz and data layout kernel_name += "_stepz"; kernel_name += support::cpp11::to_string(_step_z); - kernel_name += "_" + lower_string(string_from_data_layout(data_layout)); + kernel_name += "_" + lower_string(string_from_data_layout(_data_layout)); _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); @@ -212,7 +213,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor _config_id += "_"; _config_id += support::cpp11::to_string(conv_info.pad_top()); _config_id += "_"; - _config_id += lower_string(string_from_data_layout(input->info()->data_layout())); + _config_id += lower_string(string_from_data_layout(_data_layout)); } Status CLWinogradInputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) @@ -229,11 +230,10 @@ void CLWinogradInputTransformKernel::run(const Window &window, cl::CommandQueue ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - const DataLayout data_layout = _input->info()->data_layout(); - const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const size_t total_batches = window.shape().total_size_upper(3); + const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const size_t total_batches = window.shape().total_size_upper(3); // Collapse window 
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); @@ -241,7 +241,7 @@ void CLWinogradInputTransformKernel::run(const Window &window, cl::CommandQueue Window slice = window_collapsed.first_slice_window_3D(); slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1)); slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1)); - if(data_layout == DataLayout::NHWC) + if(_data_layout == DataLayout::NHWC) { slice.set(idx_h, Window::Dimension(0, _num_tiles_y * total_batches, 1)); } diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp index df631c3c03..98b0c106db 100644 --- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp @@ -63,7 +63,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } // namespace NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel() - : _input(nullptr), _output(nullptr), _block_shape() + : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN) { } @@ -80,6 +80,7 @@ void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, _input = input; _output = output; _block_shape = block_shape; + _data_layout = input->info()->data_layout(); // Configure kernel window Window win = calculate_max_window(*input->info(), Steps()); @@ -99,7 +100,7 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - const int idx_channel = get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL); + const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); const int depth_size = _input->info()->dimension(idx_channel); const int r = (depth_size / (_block_shape * _block_shape)); const int element_size = 
_input->info()->element_size(); @@ -112,7 +113,7 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); // Main loop for NCHW and NHWC - if(_input->info()->data_layout() == DataLayout::NCHW) + if(_data_layout == DataLayout::NCHW) { Window slice_in = window.first_slice_window_2D(); do diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp index 0641d6cfa3..27c3d66b4f 100644 --- a/src/core/NEON/kernels/NEIm2ColKernel.cpp +++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -265,10 +265,9 @@ void NEIm2ColKernel::run_im2col(const Window &window) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const DataLayout data_layout = _input->info()->data_layout(); - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); const int input_w = _input->info()->dimension(width_idx); const int input_h = _input->info()->dimension(height_idx); @@ -344,7 +343,7 @@ void NEIm2ColKernel::run_im2col(const Window &window) } NEIm2ColKernel::NEIm2ColKernel() - : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), 
_has_bias(false), _dilation(1U, 1U) + : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U), _data_layout(DataLayout::UNKNOWN) { } @@ -355,9 +354,9 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups)); ARM_COMPUTE_UNUSED(num_groups); - const DataLayout data_layout = input->info()->data_layout(); - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + _data_layout = input->info()->data_layout(); + const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); _input = input; _output = output; @@ -370,7 +369,7 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size _conv_info, _dilation); _has_bias = has_bias; - if(data_layout == DataLayout::NCHW) + if(_data_layout == DataLayout::NCHW) { switch(_input->info()->data_type()) { diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp index 62c9ca0d5e..14de4a19d8 100644 --- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp @@ -321,7 +321,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } // namespace NEPoolingLayerKernel::NEPoolingLayerKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false) + : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), 
_num_elems_processed_per_iteration(0), _border_size(0), _is_square(false) { } @@ -364,14 +364,15 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h)); // Set instance variables - _input = input; - _output = output; - _pool_info = pool_info; - _is_square = (pool_size.x() == pool_size.y()); + _input = input; + _output = output; + _pool_info = pool_info; + _data_layout = input->info()->data_layout(); + _is_square = (pool_size.x() == pool_size.y()); // Get data type const DataType data_type = input->info()->data_type(); - const bool is_nchw = data_layout == DataLayout::NCHW; + const bool is_nchw = _data_layout == DataLayout::NCHW; if(data_type == DataType::QASYMM8) { @@ -1574,7 +1575,12 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const // Calculate square-root in case of l2 pooling if(pooling_type == PoolingType::L2) { - vres = vmulq_f32(vres, vinvsqrtq_f32(vres)); + float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))), + static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))), + static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))), + static_cast<float>(sqrt(vgetq_lane_f32(vres, 3))) + }; + vres = l2_res; } // Store result @@ -1835,7 +1841,7 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) const bool exclude_padding = _pool_info.exclude_padding(); Window window_input(window); - if(_input->info()->data_layout() == DataLayout::NCHW) + if(_data_layout == DataLayout::NCHW) { // Set step for input in x and y direction for the input unsigned int window_x_inc = 0; diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp index ffa4fa3565..16cd6f77b4 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -1,5 +1,5 @@ /* - * 
Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -742,23 +742,8 @@ struct RedOpYZW for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) { - T *in_ptr; - switch(axis) - { - case 1: - in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim))); - break; - case 2: - in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim))); - break; - case 3: - in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim))); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } + const T *in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.strides_in_bytes()[axis] * dim); const auto vec_elements = wrapper::vloadq(in_ptr); - switch(op) { case ReductionOperation::SUM: @@ -907,23 +892,8 @@ struct RedOpYZW_qasymm8 for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) { - uint8_t *in_ptr; - switch(axis) - { - case 1: - in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, index_dim)); - break; - case 2: - in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, index_dim)); - break; - case 3: - in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, index_dim)); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } - const auto vec_elements = wrapper::vloadq(in_ptr); - + const uint8_t *in_ptr = input.ptr() + in_info.strides_in_bytes()[axis] * index_dim; + const auto vec_elements = wrapper::vloadq(in_ptr); switch(op) { case ReductionOperation::SUM: diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp index a2a44fca18..5b8e196a2c 100644 --- a/src/core/NEON/kernels/NEScaleKernel.cpp +++ b/src/core/NEON/kernels/NEScaleKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -909,7 +909,7 @@ void NEScaleKernel::scale_area_nchw(const Window &window) void NEScaleKernel::scale_nhwc(const Window &window) { // Get data layout and width/height indices - const DataLayout data_layout = _input->info()->data_layout(); + const DataLayout data_layout = DataLayout::NHWC; const int idx_channels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp index 4803365013..ffd2dc14bf 100644 --- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp +++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp @@ -66,7 +66,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } // namespace NESpaceToDepthLayerKernel::NESpaceToDepthLayerKernel() - : _input(nullptr), _output(nullptr), _block_shape() + : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN) { } @@ -82,6 +82,7 @@ void NESpaceToDepthLayerKernel::configure(const ITensor *input, ITensor *output, _input = input; _block_shape = block_shape; _output = output; + _data_layout = input->info()->data_layout(); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -100,9 +101,8 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - const DataLayout data_layout = _input->info()->data_layout(); - const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const int element_size = _input->info()->element_size(); + const int channel_idx = 
get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const int element_size = _input->info()->element_size(); const size_t channel_size = _input->info()->dimension(channel_idx); @@ -111,7 +111,7 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info int batch_id = 0; // Main loop for NCHW and NHWC - if(_output->info()->data_layout() == DataLayout::NCHW) + if(_data_layout == DataLayout::NCHW) { do { diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp index a3634cd46e..c5de43da35 100644 --- a/src/runtime/CL/functions/CLReduceMean.cpp +++ b/src/runtime/CL/functions/CLReduceMean.cpp @@ -26,20 +26,81 @@ #include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h" +#include "arm_compute/core/Error.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "support/ToolchainSupport.h" namespace arm_compute { +namespace +{ +Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(keep_dims); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); + + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + const int input_dims = input->num_dimensions(); + Coordinates axis_local = reduction_axis; + + for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i) + { + //axis: The dimensions to reduce. 
Must be in the range [-rank(input_tensor), rank(input_tensor)). + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions()))); + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions())); + } + + if(output->tensor_shape().total_size() != 0) + { + // Only validate if not using auto_init for the output tensor + TensorShape out_shape = input->tensor_shape(); + // Validate output_shape only if not using auto_init + convert_negative_axis(axis_local, input_dims); + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for(unsigned int i = 0; i < reduction_ops; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1); + if(output->total_size() > 0 && keep_dims) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); + } + if(keep_dims) + { + out_shape.set(axis_local[i], 1); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i])); + const unsigned int remove_index = axis_local[i] - i; + ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions()); + out_shape.remove_dimension(remove_index); + } + } + const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + } + return Status{}; +} +} CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims() { } void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input); + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); + // Output auto inizialitation if not yet initialized + const TensorShape 
output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); _reduction_ops = reduction_axis.num_dimensions(); _reduction_kernels.resize(_reduction_ops); @@ -49,14 +110,10 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis Coordinates axis_local = reduction_axis; const int input_dims = input->info()->num_dimensions(); - // Convert negative axis - for(unsigned int i = 0; i < _reduction_ops; ++i) - { - axis_local[i] = wrap_around(axis_local[i], input_dims); - } + convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis - for(unsigned int i = 0; i < _reduction_ops; ++i) + for(int i = 0; i < _reduction_ops; ++i) { TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); @@ -75,7 +132,7 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis } // Allocate intermediate tensors - for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for(int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } @@ -88,7 +145,7 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis // We have to sort the reduction axis vectors in order for remove_dimension // to work properly std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(unsigned int i = 0; i < _reduction_ops; ++i) + for(int i = 0; i < _reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i); } @@ -99,55 +156,16 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); - - TensorShape out_shape = input->tensor_shape(); - - Coordinates axis_sorted = reduction_axis; - const unsigned int reduction_ops = reduction_axis.num_dimensions(); - const int input_dims = input->num_dimensions(); - - // Convert negative axis - for(unsigned int i = 0; i < reduction_ops; ++i) - { - axis_sorted[i] = wrap_around(axis_sorted[i], input_dims); - } - - std::sort(axis_sorted.begin(), axis_sorted.begin() + reduction_ops); - for(unsigned int i = 0; i < reduction_ops; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(axis_sorted[i] > 3); - ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_sorted[i]) > input->num_dimensions() - 1); - if(output->total_size() > 0 && keep_dims) - { - ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_sorted[i]) != 1); - } - if(keep_dims) - { - out_shape.set(axis_sorted[i], 1); - } - else - { - out_shape.remove_dimension(axis_sorted[i] - i); - } - } - - const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); - - return Status{}; + return validate_config(input, reduction_axis, keep_dims, output); } void CLReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - for(unsigned int i = 0; i < _reduction_ops; ++i) + for(auto &kernel : _reduction_kernels) { - _reduction_kernels[i].run(); + kernel.run(); } if(!_keep_dims) diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp index 0b145f034d..96ec8b8587 100644 --- a/src/runtime/NEON/functions/NEReduceMean.cpp +++ b/src/runtime/NEON/functions/NEReduceMean.cpp @@ -24,80 +24,97 @@ #include "arm_compute/runtime/NEON/functions/NEReduceMean.h" #include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -using namespace arm_compute; +namespace arm_compute +{ NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims() { } -Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) { ARM_COMPUTE_UNUSED(keep_dims); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); - TensorShape out_shape = input->tensor_shape(); const unsigned int 
reduction_ops = reduction_axis.num_dimensions(); const int input_dims = input->num_dimensions(); Coordinates axis_local = reduction_axis; - // Convert negative axis - for(unsigned int i = 0; i < reduction_ops; ++i) + for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i) { - axis_local[i] = wrap_around(axis_local[i], input_dims); + //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)). + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions()))); + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions())); } - std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for(unsigned int i = 0; i < reduction_ops; ++i) + if(output->tensor_shape().total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); - ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1); - if(output->total_size() > 0 && keep_dims) - { - ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); - } - if(keep_dims) - { - out_shape.set(axis_local[i], 1); - } - else + // Only validate if not using auto_init for the output tensor + TensorShape out_shape = input->tensor_shape(); + // Validate output_shape only if not using auto_init + convert_negative_axis(axis_local, input_dims); + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for(unsigned int i = 0; i < reduction_ops; ++i) { - out_shape.remove_dimension(axis_local[i] - i); + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1); + if(output->total_size() > 0 && keep_dims) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); + } + if(keep_dims) + { + out_shape.set(axis_local[i], 1); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i])); + const unsigned int remove_index = axis_local[i] - i; + 
ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions()); + out_shape.remove_dimension(remove_index); + } } + const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); } - const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); - return Status{}; } +Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +{ + return validate_config(input, reduction_axis, keep_dims, output); +} + void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input); + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); + // Output auto inizialitation if not yet initialized + const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); _reduction_ops = reduction_axis.num_dimensions(); _reduction_kernels.resize(_reduction_ops); _reduced_outs.resize(_reduction_ops - (keep_dims ? 
1 : 0)); _keep_dims = keep_dims; - Coordinates axis_local = reduction_axis; - const int input_dims = input->info()->num_dimensions(); - const unsigned int reduction_ops = reduction_axis.num_dimensions(); + Coordinates axis_local = reduction_axis; + const int input_dims = input->info()->num_dimensions(); - // Convert negative axis - for(unsigned int i = 0; i < reduction_ops; ++i) - { - axis_local[i] = wrap_around(axis_local[i], input_dims); - } + convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis - for(unsigned int i = 0; i < _reduction_ops; ++i) + for(int i = 0; i < _reduction_ops; ++i) { TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); @@ -116,7 +133,7 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, } // Allocate intermediate tensors - for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for(int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } @@ -125,11 +142,10 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, if(!keep_dims) { TensorShape out_shape = input->info()->tensor_shape(); - // We have to sort the reduction axis vectors in order for remove_dimension // to work properly std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(unsigned int i = 0; i < _reduction_ops; ++i) + for(int i = 0; i < _reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i); } @@ -141,10 +157,9 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, void NEReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - - for(unsigned int i = 0; i < _reduction_ops; ++i) + for(auto &kernel : _reduction_kernels) { - _reduction_kernels[i].run(); + kernel.run(); } if(!_keep_dims) @@ -152,3 +167,5 @@ void NEReduceMean::run() _reshape.run(); } } + +} // namespace arm_compute diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h index 07ecf45d81..b479eb4953 100644 --- a/tests/datasets/ShapeDatasets.h +++ b/tests/datasets/ShapeDatasets.h @@ -203,7 +203,9 @@ public: TensorShape{ 128U, 1U, 5U, 3U }, TensorShape{ 9U, 9U, 3U, 4U }, TensorShape{ 27U, 13U, 2U, 4U }, - TensorShape{ 1U, 1U, 1U, 5U } + TensorShape{ 1U, 1U, 1U, 5U }, + TensorShape{ 1U, 16U, 10U, 2U, 128U }, + TensorShape{ 1U, 16U, 10U, 2U, 128U } }), ShapeDataset("Shape1", { @@ -212,7 +214,9 @@ public: TensorShape{ 128U, 64U, 1U, 3U }, TensorShape{ 9U, 1U, 3U }, TensorShape{ 1U }, - TensorShape{ 9U, 9U, 3U, 5U } + TensorShape{ 9U, 9U, 3U, 5U }, + TensorShape{ 1U, 1U, 1U, 1U, 128U }, + TensorShape{ 128U } })) { } @@ -686,7 +690,7 @@ public: : ShapeDataset("InputShape", { // Batch size 1 - TensorShape{ 32U, 37U, 3U }, + TensorShape{ 32U, 37U, 3U }, // Batch size 4 TensorShape{ 32U, 37U, 3U, 4U }, }) @@ -702,7 +706,7 @@ public: : ShapeDataset("InputShape", { // Batch size 1 - TensorShape{ 32U, 37U, 3U 
}, + TensorShape{ 32U, 37U, 3U }, // Batch size 4 TensorShape{ 32U, 37U, 3U, 4U }, // Arbitrary batch size diff --git a/tests/validation/CL/ReduceMean.cpp b/tests/validation/CL/ReduceMean.cpp index cfd4a2730c..1b7400bf53 100644 --- a/tests/validation/CL/ReduceMean.cpp +++ b/tests/validation/CL/ReduceMean.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -55,20 +55,26 @@ TEST_SUITE(ReduceMean) // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid axis TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid output shape - TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32) + TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32),// OK + TensorInfo(TensorShape{228U, 19U, 2U, 2U}, 1, DataType::F32),// OK + TensorInfo(TensorShape{228U, 19U, 2U, 1U}, 1, DataType::F32) // Cannot support axis 3 not valid }), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32), TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::F32) + TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(19U), 1, DataType::F32), + TensorInfo(TensorShape(19U), 1, DataType::F32) + })), - framework::dataset::make("Axis", { Coordinates(4), Coordinates(0,2), Coordinates(2) })), - framework::dataset::make("Expected", { false, false, true })), - input_info, output_info, axis, expected) + framework::dataset::make("Axis", { Coordinates(4), Coordinates(0,2), Coordinates(2), Coordinates(3,2,0), Coordinates(3,2,0) })), + framework::dataset::make("Keep", { true, true, true, false, false })), + framework::dataset::make("Expected", 
{ false, false, true, true, false })), + input_info, output_info, axis, keep, expected) { - const Status status = CLReduceMean::validate(&input_info.clone()->set_is_resizable(false), axis, true, &output_info.clone()->set_is_resizable(false)); + const Status status = CLReduceMean::validate(&input_info.clone()->set_is_resizable(false), axis, keep, &output_info.clone()->set_is_resizable(false)); ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); } // clang-format on diff --git a/tests/validation/NEON/ReduceMean.cpp b/tests/validation/NEON/ReduceMean.cpp index 3cd7ce362e..6d0caf7160 100644 --- a/tests/validation/NEON/ReduceMean.cpp +++ b/tests/validation/NEON/ReduceMean.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -57,20 +57,26 @@ TEST_SUITE(ReduceMean) // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid axis TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid output shape - TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32) + TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32),// OK + TensorInfo(TensorShape{228U, 19U, 2U, 2U}, 1, DataType::F32),// OK + TensorInfo(TensorShape{228U, 19U, 2U, 1U}, 1, DataType::F32) // Cannot support axis 3 not valid }), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32), TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::F32) + TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(19U), 1, DataType::F32), + TensorInfo(TensorShape(19U), 1, DataType::F32) + })), - framework::dataset::make("Axis", { 
Coordinates(4), Coordinates(0,2), Coordinates(2) })), - framework::dataset::make("Expected", { false, false, true })), - input_info, output_info, axis, expected) + framework::dataset::make("Axis", { Coordinates(4), Coordinates(0,2), Coordinates(2), Coordinates(3,2,0), Coordinates(3,2,0) })), + framework::dataset::make("Keep", { true, true, true, false, false })), + framework::dataset::make("Expected", { false, false, true, true, false })), + input_info, output_info, axis, keep, expected) { - const Status status = NEReduceMean::validate(&input_info.clone()->set_is_resizable(false), axis, true, &output_info.clone()->set_is_resizable(false)); + const Status status = NEReduceMean::validate(&input_info.clone()->set_is_resizable(false), axis, keep, &output_info.clone()->set_is_resizable(false)); ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); } // clang-format on |