From 0021d750d66d199c411df00cdd8308c325f1fef3 Mon Sep 17 00:00:00 2001 From: Diego Lopez Recas Date: Mon, 18 Dec 2017 14:42:56 +0000 Subject: IVGCVSW-863 Broadcast support in CL/NEON Arithmetic Add Also, added instrumentation to support generic tensor broadcasting for NEON and CL backends. Change-Id: I1bc5747a286e1a4b464c209067581e103d473b9a Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114201 Reviewed-by: Anthony Barbier Tested-by: Jenkins --- arm_compute/core/CL/ICLKernel.h | 124 +++++++++++++-------- .../core/CL/kernels/CLArithmeticAdditionKernel.h | 3 +- arm_compute/core/Dimensions.h | 27 ++--- arm_compute/core/Helpers.inl | 14 ++- arm_compute/core/IAccessWindow.h | 2 +- arm_compute/core/ITensorInfo.h | 46 ++++++++ .../core/NEON/kernels/NEArithmeticAdditionKernel.h | 1 + arm_compute/core/TensorShape.h | 58 ++++++++++ arm_compute/core/Window.h | 57 +++++++++- arm_compute/core/Window.inl | 76 +++++++------ 10 files changed, 305 insertions(+), 103 deletions(-) (limited to 'arm_compute/core') diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h index a1bc3eb8d2..e660ae55a0 100644 --- a/arm_compute/core/CL/ICLKernel.h +++ b/arm_compute/core/CL/ICLKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -41,14 +41,40 @@ class Window; /** Common interface for all the OpenCL kernels */ class ICLKernel : public IKernel { +private: + /** Returns the number of arguments enqueued per array object. + * + * @return The number of arguments enqueued per array object. + */ + template + constexpr static unsigned int num_arguments_per_array() + { + return num_arguments_per_tensor(); + } + /** Returns the number of arguments enqueued per tensor object. + * + * @return The number of arguments enqueued per tensor object. + */ + template + constexpr static unsigned int num_arguments_per_tensor() + { + return 2 + 2 * dimension_size; + } + public: /** Constructor */ - ICLKernel(); + ICLKernel() + : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0) + { + } /** Returns a reference to the OpenCL kernel of this object. * * @return A reference to the OpenCL kernel of this object. */ - cl::Kernel &kernel(); + cl::Kernel &kernel() + { + return _kernel; + } /** Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set. @@ -58,60 +84,90 @@ public: * @param[in] window Window the kernel will be executed on. */ template - void add_1D_array_argument(unsigned int &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window); + void add_1D_array_argument(unsigned int &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window) + { + add_array_argument(idx, array, strides, num_dimensions, window); + } /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. * @param[in] tensor Tensor to set as an argument of the object's kernel. * @param[in] window Window the kernel will be executed on. */ - void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window); + void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) + { + add_tensor_argument<1>(idx, tensor, window); + } /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. * @param[in] tensor Tensor to set as an argument of the object's kernel. * @param[in] window Window the kernel will be executed on. */ - void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window); + void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) + { + add_tensor_argument<2>(idx, tensor, window); + } /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. * @param[in] tensor Tensor to set as an argument of the object's kernel. * @param[in] window Window the kernel will be executed on. */ - void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window); + void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) + { + add_tensor_argument<3>(idx, tensor, window); + } /** Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. * @param[in] tensor Tensor to set as an argument of the object's kernel. * @param[in] window Window the kernel will be executed on. */ - void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window); + void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) + { + add_tensor_argument<4>(idx, tensor, window); + } /** Returns the number of arguments enqueued per 1D array object. * * @return The number of arguments enqueues per 1D array object. */ - unsigned int num_arguments_per_1D_array() const; + constexpr static unsigned int num_arguments_per_1D_array() + { + return num_arguments_per_array<1>(); + } /** Returns the number of arguments enqueued per 1D tensor object. * * @return The number of arguments enqueues per 1D tensor object. */ - unsigned int num_arguments_per_1D_tensor() const; + constexpr static unsigned int num_arguments_per_1D_tensor() + { + return num_arguments_per_tensor<1>(); + } /** Returns the number of arguments enqueued per 2D tensor object. * * @return The number of arguments enqueues per 2D tensor object. */ - unsigned int num_arguments_per_2D_tensor() const; + constexpr static unsigned int num_arguments_per_2D_tensor() + { + return num_arguments_per_tensor<2>(); + } /** Returns the number of arguments enqueued per 3D tensor object. * * @return The number of arguments enqueues per 3D tensor object. */ - unsigned int num_arguments_per_3D_tensor() const; + constexpr static unsigned int num_arguments_per_3D_tensor() + { + return num_arguments_per_tensor<3>(); + } /** Returns the number of arguments enqueued per 4D tensor object. * * @return The number of arguments enqueues per 4D tensor object. */ - unsigned int num_arguments_per_4D_tensor() const; + constexpr static unsigned int num_arguments_per_4D_tensor() + { + return num_arguments_per_tensor<4>(); + } /** Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue. * * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns. @@ -161,7 +217,10 @@ public: * * @param[in] target The targeted GPU architecture */ - void set_target(GPUTarget target); + void set_target(GPUTarget target) + { + _target = target; + } /** Set the targeted GPU architecture according to the CL device * @@ -173,7 +232,10 @@ public: * * @return The targeted GPU architecture. */ - GPUTarget get_target() const; + GPUTarget get_target() const + { + return _target; + } /** Get the maximum workgroup size for the device the CLKernelLibrary uses. * @@ -207,18 +269,6 @@ private: */ template void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window); - /** Returns the number of arguments enqueued per array object. - * - * @return The number of arguments enqueued per array object. - */ - template - unsigned int num_arguments_per_array() const; - /** Returns the number of arguments enqueued per tensor object. - * - * @return The number of arguments enqueued per tensor object. - */ - template - unsigned int num_arguments_per_tensor() const; protected: cl::Kernel _kernel; /**< OpenCL kernel to run */ @@ -246,6 +296,8 @@ void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, c template void ICLKernel::add_array_argument(unsigned &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window) { + ARM_COMPUTE_ERROR_ON(array == nullptr); + // Calculate offset to the start of the window unsigned int offset_first_element = 0; @@ -269,23 +321,5 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray *array, cons "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array()); ARM_COMPUTE_UNUSED(idx_start); } - -template -void ICLKernel::add_1D_array_argument(unsigned int &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window) -{ - add_array_argument(idx, array, strides, num_dimensions, window); -} - -template -unsigned int ICLKernel::num_arguments_per_array() const -{ - return num_arguments_per_tensor(); -} - -template -unsigned int ICLKernel::num_arguments_per_tensor() const -{ - return 2 + 2 * dimension_size; -} } #endif /*__ARM_COMPUTE_ICLKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h index 96b8dc8d48..5112476aae 100644 --- a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h +++ b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -72,6 +72,7 @@ public: // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; private: const ICLTensor *_input1; /**< Source tensor 1 */ diff --git a/arm_compute/core/Dimensions.h b/arm_compute/core/Dimensions.h index ae8d6c3503..58ffd7ff3c 100644 --- a/arm_compute/core/Dimensions.h +++ b/arm_compute/core/Dimensions.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -135,23 +135,24 @@ public: * @param[in] n Number of dimensions to collapse into @p first. * @param[in] first Dimensions into which the following @p n are collapsed. */ - void collapse(size_t n, size_t first = 0) + void collapse(const size_t n, const size_t first = 0) { ARM_COMPUTE_ERROR_ON(first + n > _id.size()); - if(n == 0) + const size_t last = std::min(_num_dimensions, first + n); + + if(last > (first + 1)) { - return; + // Collapse dimensions into the first + _id[first] = std::accumulate(&_id[first], &_id[last], 1, std::multiplies()); + // Shift the remaining dimensions down + std::copy(&_id[last], &_id[_num_dimensions], &_id[first + 1]); + // Reduce the number of dimensions + const size_t old_num_dimensions = _num_dimensions; + _num_dimensions -= last - first - 1; + // Fill the now empty dimensions with zero + std::fill(&_id[_num_dimensions], &_id[old_num_dimensions], 0); } - - // Collapse dimensions into the first - _id[first] = std::accumulate(_id.cbegin() + first, _id.cbegin() + first + n, 1, std::multiplies()); - // Shift the remaining dimensions down - std::copy(_id.begin() + first + n, _id.end(), _id.begin() + first + 1); - // Reduce the number of dimensions - _num_dimensions -= std::min(n, _num_dimensions) - 1; - // Fill the now empty dimensions with zero - std::fill(_id.begin() + _num_dimensions, _id.end(), 0); } /** Collapse dimensions starting from a given point diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl index 6d0f8b0104..8b86c22676 100644 --- a/arm_compute/core/Helpers.inl +++ b/arm_compute/core/Helpers.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2018 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -123,6 +123,11 @@ inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... i { w.validate(); + for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_ERROR_ON(w[i].step() == 0); + } + Coordinates id; ForEachDimension::unroll(w, id, std::forward(lambda_function), std::forward(iterators)...); } @@ -136,9 +141,10 @@ inline Iterator::Iterator(const ITensor *tensor, const Window &win) : Iterator() { ARM_COMPUTE_ERROR_ON(tensor == nullptr); - const ITensorInfo *info = tensor->info(); - ARM_COMPUTE_ERROR_ON(info == nullptr); - const Strides &strides = info->strides_in_bytes(); + ARM_COMPUTE_ERROR_ON(tensor->info() == nullptr); + + const ITensorInfo *info = tensor->info(); + const Strides &strides = info->strides_in_bytes(); _ptr = tensor->buffer() + info->offset_first_element_in_bytes(); diff --git a/arm_compute/core/IAccessWindow.h b/arm_compute/core/IAccessWindow.h index 583041a48b..4bbcbb3a40 100644 --- a/arm_compute/core/IAccessWindow.h +++ b/arm_compute/core/IAccessWindow.h @@ -139,8 +139,8 @@ public: } AccessWindowRectangle(const AccessWindowRectangle &) = delete; + AccessWindowRectangle(AccessWindowRectangle &&) = delete; AccessWindowRectangle &operator=(const AccessWindowRectangle &) = delete; - AccessWindowRectangle(AccessWindowRectangle &&) = default; AccessWindowRectangle &operator=(AccessWindowRectangle &&) = default; ~AccessWindowRectangle() = default; diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h index 9112f3ea18..b5677dffd6 100644 --- a/arm_compute/core/ITensorInfo.h +++ b/arm_compute/core/ITensorInfo.h @@ -30,6 +30,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ICloneable.h" +#include "arm_compute/core/utils/misc/utility.h" #include @@ -221,6 +222,51 @@ public: * @return A QuantizationInfo containing the scale and offset. */ virtual QuantizationInfo quantization_info() const = 0; + + /** If infos are broadcast compatible tensor info's, return the broadcasted shape and the intersection of + * the broadcasted valid regions of the tensors. + * + * Two tensor info's are broadcast compatible if their shapes are broadcast compatible. + * + * Two tensor shapes are broadcast compatible if for each dimension, they're equal or one of them is 1. + * + * If two shapes are compatible, each dimension in the broadcasted shape is the max of the original dimensions. + * + * @param[in] infos Tensor info's. + * + * @return The broadcasted shape and valid region, or an empty shape and valid region if the info's are + * not broadcast compatible. + */ + template + static std::pair broadcast_shape_and_valid_region(const Infos &... infos) + { + TensorShape bc_shape = TensorShape::broadcast_shape(infos.tensor_shape()...); + ValidRegion bc_valid_region{ Coordinates(), bc_shape }; + + auto broadcast_valid_region = [&bc_valid_region](const ITensorInfo & info) + { + if(info.num_dimensions() != 0) + { + for(size_t d = 0; d < bc_valid_region.shape.num_dimensions(); ++d) + { + const bool is_broadcast = (info.tensor_shape()[d] == 1); + + const int anchor_max = std::max(bc_valid_region.anchor[d], info.valid_region().anchor[d]); + const size_t valid_min = std::min(bc_valid_region.shape[d], info.valid_region().shape[d]); + + if(!is_broadcast || (valid_min == 0)) + { + bc_valid_region.anchor.set(d, anchor_max); + bc_valid_region.shape.set(d, valid_min); + } + } + } + }; + + utility::for_each(broadcast_valid_region, infos...); + + return std::pair(bc_shape, bc_valid_region); + } }; } #endif /*__ARM_COMPUTE_TENSORINFO_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h index eedecfb524..155e792f5d 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h @@ -85,6 +85,7 @@ public: // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; + BorderSize border_size() const override; private: /** Common signature for all the specialised add functions diff --git a/arm_compute/core/TensorShape.h b/arm_compute/core/TensorShape.h index 50f1211c18..dc836c98da 100644 --- a/arm_compute/core/TensorShape.h +++ b/arm_compute/core/TensorShape.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Dimensions.h" #include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/misc/utility.h" #include #include @@ -132,6 +133,19 @@ public: std::fill(_id.begin() + _num_dimensions, _id.end(), 1); } + /** Return a copy with collapsed dimensions starting from a given point. + * + * @param[in] start Starting point of collapsing dimensions. + * + * @return A copy with collapse dimensions starting from start. + */ + TensorShape collapsed_from(size_t start) const + { + TensorShape copy(*this); + copy.collapse(num_dimensions(), start); + return copy; + } + /** Collapses all dimensions to a single linear total size. * * @return The total tensor size in terms of elements. @@ -164,6 +178,50 @@ public: return std::accumulate(_id.begin(), _id.begin() + dimension, 1, std::multiplies()); } + /** If shapes are broadcast compatible, return the broadcasted shape. + * + * Two tensor shapes are broadcast compatible if for each dimension, they're equal or one of them is 1. + * + * If two shapes are compatible, each dimension in the broadcasted shape is the max of the original dimensions. + * + * @param[in] shapes Tensor shapes. + * + * @return The broadcasted shape or an empty shape if the shapes are not broadcast compatible. + */ + template + static TensorShape broadcast_shape(const Shapes &... shapes) + { + TensorShape bc_shape; + + auto broadcast = [&bc_shape](const TensorShape & other) + { + if(bc_shape.num_dimensions() == 0) + { + bc_shape = other; + } + else if(other.num_dimensions() != 0) + { + for(size_t d = 0; d < TensorShape::num_max_dimensions; ++d) + { + const size_t dim_min = std::min(bc_shape[d], other[d]); + const size_t dim_max = std::max(bc_shape[d], other[d]); + + if((dim_min != 1) && (dim_min != dim_max)) + { + bc_shape = TensorShape{ 0U }; + break; + } + + bc_shape.set(d, dim_max); + } + } + }; + + utility::for_each(broadcast, shapes...); + + return bc_shape; + } + private: /** Remove trailing dimensions of size 1 from the reported number of dimensions. */ void apply_dimension_correction() diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h index c890bf8f9e..cca12c9efe 100644 --- a/arm_compute/core/Window.h +++ b/arm_compute/core/Window.h @@ -104,6 +104,14 @@ public: { _step = step; } + /** Set the dimension's end + * + * @param[in] end The new end + */ + void set_end(int end) + { + _end = end; + } private: int _start; /**< Start of the dimension */ @@ -302,27 +310,64 @@ public: return slide_window_slice<4>(slice); } + /* Collapse the dimensions between @p first and @p last if possible. + * + * A dimension is collapsable if it starts from 0 and matches the corresponding dimension in the full_window + * + * @param[in] full_window Full window @p window has been created from. + * @param[in] first Start dimension into which the following are collapsed. + * @param[in] last End (exclusive) dimension to collapse. + * @param[out] has_collapsed (Optional) Whether the window was collapsed. + * + * @return Collapsed window. + */ + Window collapse_if_possible(const Window &full_window, size_t first, size_t last, bool *has_collapsed = nullptr) const; + /* Collapse the dimensions higher than @p first if possible. * * A dimension is collapsable if it starts from 0 and matches the corresponding dimension in the full_window * - * @param[in] full_window Full window @p window has been created from. - * @param[in] first Dimensions into which the following are collapsed. + * @param[in] full_window Full window @p window has been created from. + * @param[in] first Start dimension into which the following are collapsed. + * @param[out] has_collapsed (Optional) Whether the window was collapsed. * * @return Collapsed window. */ - Window collapse_if_possible(const Window &full_window, size_t first) const; + Window collapse_if_possible(const Window &full_window, size_t first, bool *has_collapsed = nullptr) const + { + return collapse_if_possible(full_window, first, Coordinates::num_max_dimensions, has_collapsed); + } - /* Collapse the dimensions higher than @p first. + /* Collapse the dimensions between @p first and @p last. * * A dimension is collapsable if it starts from 0 and matches the corresponding dimension in the full_window * * @param[in] full_window Full window @p window has been created from. - * @param[in] first Dimensions into which the following are collapsed. + * @param[in] first Start dimension into which the following are collapsed. + * @param[in] last End (exclusive) dimension to collapse. * * @return Collapsed window if successful. */ - Window collapse(const Window &full_window, size_t first) const; + Window collapse(const Window &full_window, size_t first, size_t last = Coordinates::num_max_dimensions) const; + + /* Don't advance in the dimension where @p shape is less equal to 1. + * + * @param[in] shape A TensorShape. + * + * @return Broadcast window. + */ + Window broadcast_if_dimension_le_one(const TensorShape &shape) const; + + /* Don't advance in the dimension where shape of @p info is less equal to 1. + * + * @param[in] info An ITensorInfo. + * + * @return Broadcast window. + */ + Window broadcast_if_dimension_le_one(const ITensorInfo &info) const + { + return broadcast_if_dimension_le_one(info.tensor_shape()); + } private: /** First slice of the window diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl index 1b21820f90..23b2a8e322 100644 --- a/arm_compute/core/Window.inl +++ b/arm_compute/core/Window.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,55 +37,66 @@ inline constexpr const Window::Dimension &Window::operator[](size_t dimension) c // Precondition: dimension < Coordinates::num_max_dimensions return _dims.at(dimension); } + inline void Window::set(size_t dimension, const Window::Dimension &dim) { ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions); _dims[dimension] = dim; } -inline Window Window::collapse_if_possible(const Window &full_window, size_t first) const +inline Window Window::collapse_if_possible(const Window &full_window, const size_t first, + const size_t last, bool *has_collapsed) const { - bool is_collapsable = false; - Window collapsed; - for(size_t d = 0; d < Coordinates::num_max_dimensions; ++d) + Window collapsed(*this); + + bool is_collapsable = true; + int collapsed_end = _dims[first].end(); + + for(size_t d = first + 1; is_collapsable && (d < last); ++d) { - if(is_collapsable) - { - collapsed.set(first, Window::Dimension(collapsed[first].end() * _dims[d].start(), collapsed[first].end() * _dims[d].end())); - } - else - { - collapsed.set(d, _dims[d]); - } + // The _dims's dimension must match the full _dims dimension to be collapsable: + is_collapsable = (_dims[d].start() == 0) && (full_window[d].start() == 0) && (_dims[d].step() <= 1) + && (full_window[d].end() == _dims[d].end()); + collapsed_end *= _dims[d].end(); + } - if(is_collapsable || d == first) // Try to start collapsing from this dimension - { - // The _dims's dimension must match the full _dims dimension to be collapsable: - is_collapsable = _dims[d].start() == 0 && _dims[d].start() == full_window[d].start() - && full_window[d].end() == _dims[d].end(); - } - else + if(is_collapsable) + { + collapsed._dims.at(first).set_end(collapsed_end); + for(size_t d = first + 1; is_collapsable && (d < last); ++d) { - is_collapsable = false; + collapsed.set(d, Dimension()); } } + + if(has_collapsed != nullptr) + { + *has_collapsed = is_collapsable; + } + return collapsed; } -inline Window Window::collapse(const Window &full_window, size_t first) const +inline Window Window::collapse(const Window &full_window, const size_t first, const size_t last) const { - Window collapsed = collapse_if_possible(full_window, first); + bool has_collapsed = false; + Window collapsed = collapse_if_possible(full_window, first, last, &has_collapsed); // Make sure that the window has collapsed - int end = _dims[first].end(); - int start = 0; - ARM_COMPUTE_UNUSED(start); - for(size_t d = first + 1; d < Coordinates::num_max_dimensions; ++d) + ARM_COMPUTE_ERROR_ON(!has_collapsed); + return collapsed; +} + +inline Window Window::broadcast_if_dimension_le_one(const TensorShape &shape) const +{ + Window broadcastWin(*this); + for(size_t d = 0; d < TensorShape::num_max_dimensions; ++d) { - start = end * _dims[d].start(); - end *= _dims[d].end(); + if(shape[d] <= 1) + { + broadcastWin.set(d, Dimension(0, 0, 0)); + } } - ARM_COMPUTE_ERROR_ON((collapsed[first].end() != end) || (collapsed[first].start() != start)); - return collapsed; + return broadcastWin; } inline void Window::shift(size_t dimension, int shift_value) @@ -129,9 +140,8 @@ inline void Window::validate() const { for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i) { - ARM_COMPUTE_ERROR_ON(_dims[i].step() == 0); ARM_COMPUTE_ERROR_ON(_dims[i].end() < _dims[i].start()); - ARM_COMPUTE_ERROR_ON((_dims[i].end() - _dims[i].start()) % _dims[i].step()); + ARM_COMPUTE_ERROR_ON((_dims[i].step() != 0) && (((_dims[i].end() - _dims[i].start()) % _dims[i].step()) != 0)); } } -- cgit v1.2.1