From 7b9998d0fe1f98768b690ead10ebfa166d1b873d Mon Sep 17 00:00:00 2001
From: Manuel Bottini
Date: Mon, 21 Oct 2019 17:59:07 +0100
Subject: COMPMID-1816: Use parallel reduction on 0 axis in CL ARG_MIN/ARG_MAX

Introducing new CLArgMinMax kernel

Change-Id: I0b8254207cc3859d19ceef9b6429cf5c1c586db0
Signed-off-by: Manuel Bottini
Reviewed-on: https://review.mlplatform.org/c/2202
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Michalis Spyrou
---
 arm_compute/core/CL/CLHelpers.h                    | 11 +++
 arm_compute/core/CL/CLKernels.h                    |  7 +-
 .../core/CL/kernels/CLArgMinMaxLayerKernel.h       | 94 ++++++++++++++++++++++
 .../core/CL/kernels/CLReductionOperationKernel.h   | 19 ++---
 .../runtime/CL/functions/CLArgMinMaxLayer.h        | 27 ++++---
 .../runtime/CL/functions/CLReductionOperation.h    | 10 +--
 arm_compute/runtime/Utils.h                        | 13 ++-
 7 files changed, 148 insertions(+), 33 deletions(-)
 create mode 100644 arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h

(limited to 'arm_compute')

diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
index cd65eafc9c..7e549be989 100644
--- a/arm_compute/core/CL/CLHelpers.h
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -190,5 +190,16 @@ bool preferred_dummy_work_items_support(const cl::Device &device);
 * @return An opencl kernel
 */
cl::Kernel create_opencl_kernel(CLCoreRuntimeContext *ctx, const std::string &kernel_name, const CLBuildOptions &build_opts);
+
+/** Creates a suitable LWS hint object for parallel implementations. Sets the number of work-groups based on the input size.
+ * If the input width is smaller than 128, fewer than 8 threads can be used.
+ *
+ * @param[in] input_dimension number of elements along the dimension to apply the parallelization
+ * @param[in] vector_size     size of the vector in OpenCL
+ *
+ * @return An LWS hint object
+ */
+cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size);
+
} // namespace arm_compute
#endif /* __ARM_COMPUTE_CLHELPERS_H__ */
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index c3c485db7c..78437beffb 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -21,13 +21,14 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#ifndef __ARM_COMPUTE_CLKERNELS_H__
-#define __ARM_COMPUTE_CLKERNELS_H__
+#ifndef ARM_COMPUTE_CLKERNELS_H
+#define ARM_COMPUTE_CLKERNELS_H

/* Header regrouping all the CL kernels */
#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
@@ -160,4 +161,4 @@
#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h"
#include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"

-#endif /* __ARM_COMPUTE_CLKERNELS_H__ */
+#endif /* ARM_COMPUTE_CLKERNELS_H */
diff --git a/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h b/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h
new file mode 100644
index 0000000000..7f4cfe3edc
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H +#define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the reduction operation kernel + * + * @note The default data type for an uninitialized output tensor is + * signed 32-bit integer (S32). It is the user's responsibility to check + * that the results do not overflow because the indices are computed + * in unsigned 32-bit (U32). + */ +class CLArgMinMaxLayerKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLArgMinMaxLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArgMinMaxLayerKernel(const CLArgMinMaxLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArgMinMaxLayerKernel &operator=(const CLArgMinMaxLayerKernel &) = delete; + /** Allow instances of this class to be moved */ + CLArgMinMaxLayerKernel(CLArgMinMaxLayerKernel &&) = default; + /** Allow instances of this class to be moved */ + CLArgMinMaxLayerKernel &operator=(CLArgMinMaxLayerKernel &&) = default; + /** Default destructor */ + ~CLArgMinMaxLayerKernel() = default; + + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: S32/F16/F32. + * @param[in] prev_output Destination tensor of the previous iterations of @ref CLArgMinMaxLayerKernel. Data types supported: U32/S32 + * Has to be nullptr for the first iteration + * @param[out] output Destination tensor. Data types supported: U32/S32 + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 + * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported. + */ + void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op); + + /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel. + * + * @param[in] input Source tensor info. Data types supported: S32/F16/F32. + * @param[in] prev_output Destination tensor info of the previous iterations. Data types supported: U32/S32 + * Has to be nullptr for the first iteration + * @param[in] output Destination tensor info. 
+     *                        Output will have the same number of dimensions as input.
+     * @param[in] axis        Axis along which to reduce. Supported reduction axis : 0,1,2,3
+     * @param[in] op          Reduction operation to perform. Only ArgMin and ArgMax are supported.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor   *_input;
+    const ICLTensor   *_prev_output;
+    ICLTensor         *_output;
+    unsigned int       _reduction_axis;
+    ReductionOperation _op;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
index 172ed8985a..1ed7e6e5aa 100644
--- a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
+++ b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
@@ -21,8 +21,8 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#ifndef __ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H__
-#define __ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H__
+#ifndef ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H
+#define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H

#include "arm_compute/core/CL/ICLKernel.h"
#include "arm_compute/core/Types.h"
@@ -32,11 +32,6 @@ namespace arm_compute
class ICLTensor;

/** Interface for the reduction operation kernel
- *
- * @note For ARG_MIN/ARG_MAX reduction, the default data type for an uninitialized
- * output tensor is signed 32-bit integer (S32). It is the user's responsibility
- * to check that the results do not overflow because the indices are computed
- * in unsigned 32-bit (U32).
 */
class CLReductionOperationKernel : public ICLKernel
{
@@ -57,10 +52,10 @@ public:
    /** Set the input and output tensors.
     *
     * @param[in]  input  Source tensor. Data types supported: QASYMM8/S32/F16/F32.
-     * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input, U32/S32 for ARG_MIX/ARG_MAX.
+     * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
     *                     Output will have the same number of dimensions as input.
     * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0,1,2,3
-     * @param[in]  op     Reduction operation to perform.
+     * @param[in]  op     Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
     * @param[in]  width  (Optional) In case of x-axis we also need to provide the width of the input image.
     */
    void configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
@@ -68,10 +63,10 @@ public:
    /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel.
     *
     * @param[in] input  Source tensor info. Data types supported: QASYMM8/S32/F16/F32.
-     * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input, U32/S32 for ARG_MIX/ARG_MAX.
+     * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
     *                    Output will have the same number of dimensions as input.
     * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0,1,2,3
-     * @param[in] op     Reduction operation to perform.
+     * @param[in] op     Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
     * @param[in] width  (Optional) In case of x-axis we also need to provide the width of the input image.
     *
     * @return a status
@@ -90,4 +85,4 @@ private:
    BorderSize _border_size;
};
} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H__ */
+#endif /*ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H */
diff --git a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
index 1b465a4866..21cded0417 100644
--- a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
@@ -21,10 +21,13 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#ifndef __ARM_COMPUTE_CLARGMINMAXLAYER_H__
-#define __ARM_COMPUTE_CLARGMINMAXLAYER_H__
+#ifndef ARM_COMPUTE_CLARGMINMAXLAYER_H
+#define ARM_COMPUTE_CLARGMINMAXLAYER_H

+#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -33,7 +36,6 @@ namespace arm_compute
{
class ITensorInfo;
class ICLTensor;
-class CLReductionOperation;

/** Function to calculate the index of the minimum or maximum values in a
 *  tensor based on an axis.
@@ -53,19 +55,18 @@ public:
    CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
    /** Set the input and output tensors.
     *
-     * @param[in]  input  Input source tensor, this could be written if @ref CLReductionOperation
-     *                    manipulates its border for better performance. Data types supported: F16/F32.
+     * @param[in]  input  Input source tensor. Data types supported: F16/F32.
     * @param[in]  axis   Axis to find max/min index.
     * @param[out] output Output source tensor. Data types supported: U32/S32.
-     * @param[in]  op     Operation to perform: min or max
+     * @param[in]  op     Reduction operation to perform. Operations supported: ARG_IDX_MAX, ARG_IDX_MIN
     */
-    void configure(ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
+    void configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
    /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayer
     *
     * @param[in] input  Input source tensor info. Data types supported: F16/F32.
     * @param[in] axis   Axis to find max/min index.
     * @param[in] output Output source tensor info. Data types supported: U32/S32.
-     * @param[in] op     Operation to perform: min or max
+     * @param[in] op     Reduction operation to perform. Operations supported: ARG_IDX_MAX, ARG_IDX_MIN
     *
     * @return a status
     */
@@ -75,7 +76,13 @@ public:
    void run() override;

private:
-    std::unique_ptr<CLReductionOperation> _reduction_function;
+    MemoryGroup                         _memory_group;
+    std::vector<CLTensor>               _results_vector;
+    CLTensor                            _not_reshaped_output;
+    std::vector<CLArgMinMaxLayerKernel> _reduction_kernels_vector;
+    CLReshapeLayerKernel                _reshape_kernel;
+    unsigned int                        _num_of_stages;
+    unsigned int                        _reduction_axis;
};
} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLARGMINMAXLAYER_H__ */
+#endif /* ARM_COMPUTE_CLARGMINMAXLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index 405e1177fd..9e0bf03ffe 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -21,8 +21,8 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#ifndef __ARM_COMPUTE_CLREDUCTIONOPERATION_H__
-#define __ARM_COMPUTE_CLREDUCTIONOPERATION_H__
+#ifndef ARM_COMPUTE_CLREDUCTIONOPERATION_H
+#define ARM_COMPUTE_CLREDUCTIONOPERATION_H

#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
@@ -57,7 +57,7 @@ public:
     * @param[in] input     Source tensor. Data types supported: QASYMM8/F16/F32.
     * @param[out] output    Destination tensor. Data types and data layouts supported: Same as @p input.
     * @param[in] axis      Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
-     * @param[in] op        Reduction operation to perform.
+     * @param[in] op        Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
     * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
     */
    void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
@@ -67,7 +67,7 @@ public:
     * @param[in] input     Source tensor info. Data types supported: QASYMM8/F16/F32.
     * @param[in] output    Destination tensor info. Data types and data layouts supported: Same as @p input.
     * @param[in] axis      Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
-     * @param[in] op        Reduction operation to perform.
+     * @param[in] op        Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
     * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
     *
     * @return a status
@@ -92,4 +92,4 @@ private:
    bool _is_reshape_required;
};
} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLREDUCTIONOPERATION_H__ */
+#endif /* ARM_COMPUTE_CLREDUCTIONOPERATION_H */
\ No newline at end of file
diff --git a/arm_compute/runtime/Utils.h b/arm_compute/runtime/Utils.h
index 15c0042a33..9a5b20eb26 100644
--- a/arm_compute/runtime/Utils.h
+++ b/arm_compute/runtime/Utils.h
@@ -21,8 +21,8 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#ifndef __ARM_COMPUTE_RUNTIME_UTILS_H__
-#define __ARM_COMPUTE_RUNTIME_UTILS_H__
+#ifndef ARM_COMPUTE_RUNTIME_UTILS_H
+#define ARM_COMPUTE_RUNTIME_UTILS_H

#include "arm_compute/runtime/IRuntimeContext.h"
#include "arm_compute/runtime/Scheduler.h"
@@ -46,5 +46,12 @@ const std::string &string_from_scheduler_type(Scheduler::Type t);
 * @param[in] hints  Hints to use.
 */
void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const IScheduler::Hints &hints);
+
+/** Calculate number of stages for parallel implementations
+ *
+ * @param[in] input_x_dimension input tensor x dimension
+ * @param[in] axis              axis to be used
+ */
+unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis);
} // namespace arm_compute
-#endif /* __ARM_COMPUTE_RUNTIME_UTILS_H__ */
+#endif /* ARM_COMPUTE_RUNTIME_UTILS_H */
--
cgit v1.2.1
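Usage sketch (not part of the patch above): the snippet below shows how the reworked CLArgMinMaxLayer function might be driven after this change, based only on the configure() signature visible in the diff. The tensor shape, the reduction axis and the choice of ARG_IDX_MAX are illustrative assumptions, and the output tensor is left uninitialised on the assumption that configure() auto-initialises it (defaulting to S32, as the kernel documentation above notes).

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"

using namespace arm_compute;

int main()
{
    // Set up the default OpenCL context and queue used by CL functions
    CLScheduler::get().default_init();

    // Illustrative 2D input: 200 elements along X (axis 0), 16 rows, F32
    CLTensor input;
    input.allocator()->init(TensorInfo(TensorShape(200U, 16U), 1, DataType::F32));

    // Output is left uninitialised; configure() is expected to set it up,
    // defaulting the data type to S32 (assumption based on the kernel note above)
    CLTensor output;

    // Arg-max indices along axis 0, the case this patch parallelises
    CLArgMinMaxLayer argmax;
    argmax.configure(&input, 0, &output, ReductionOperation::ARG_IDX_MAX);

    // Allocate the CL buffers; `input` would be filled here, e.g. via map()/unmap()
    input.allocator()->allocate();
    output.allocator()->allocate();

    argmax.run();
    CLScheduler::get().sync();
    return 0;
}

When run() executes, the private members added to CLArgMinMaxLayer suggest that the function launches the CLArgMinMaxLayerKernel over several stages along the X axis, accumulating partial results in _results_vector, and then applies the reshape kernel to produce the final index tensor.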