From 7e9391bb14d219cda310bff355669b5964b1f576 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Fri, 5 Oct 2018 14:49:28 +0100 Subject: COMPMID-1574 Implement ReduceMean in OpenCL Change-Id: Id331199f569f52a37280a9ada5bf84694580b93c Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/152843 Tested-by: bsgcomp Reviewed-by: Michele DiGiorgio --- .../core/CL/kernels/CLReductionOperationKernel.h | 14 +- arm_compute/core/Types.h | 1 + arm_compute/runtime/CL/CLFunctions.h | 1 + arm_compute/runtime/CL/functions/CLReduceMean.h | 78 ++++++ .../runtime/CL/functions/CLReductionOperation.h | 6 +- src/core/CL/CLKernelLibrary.cpp | 6 +- src/core/CL/cl_kernels/reduction_operation.cl | 196 ++++++++++++++- src/core/CL/kernels/CLReductionOperationKernel.cpp | 270 ++++++++++++++++----- src/runtime/CL/functions/CLReduceMean.cpp | 122 ++++++++++ src/runtime/CL/functions/CLReductionOperation.cpp | 156 +++++++----- tests/datasets/ReductionOperationDataset.h | 4 +- tests/validation/CL/ReduceMean.cpp | 172 +++++++++++++ tests/validation/CL/ReductionOperation.cpp | 12 +- tests/validation/NEON/ReductionOperation.cpp | 4 +- tests/validation/fixtures/ReduceMeanFixture.h | 161 ++++++++++++ tests/validation/reference/ReductionOperation.cpp | 184 +++++++++++++- tests/validation/reference/ReductionOperation.h | 2 +- utils/TypePrinter.h | 6 + 18 files changed, 1244 insertions(+), 151 deletions(-) create mode 100644 arm_compute/runtime/CL/functions/CLReduceMean.h create mode 100644 src/runtime/CL/functions/CLReduceMean.cpp create mode 100644 tests/validation/CL/ReduceMean.cpp create mode 100644 tests/validation/fixtures/ReduceMeanFixture.h diff --git a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h index 60e2f08005..ef24fd5166 100644 --- a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h +++ b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h @@ -50,25 +50,27 @@ public: /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: F16/F32. Data layouts supported: NCHW. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. Data layouts supported: NCHW. * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. * Output will have the same number of dimensions as input. - * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 * @param[in] op Reduction operation to perform. + * @param[in] width (Optional) In case of x-axis we also need to provide the width of the input image. */ - void configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op); + void configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width = 0); /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel. * - * @param[in] input Source tensor info. Data types supported: F16/F32. Data layouts supported: NCHW. + * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. Data layouts supported: NCHW. * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input. * Output will have the same number of dimensions as input. - * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 + * @param[in] axis Axis along which to reduce. 
Supported reduction axis : 0,1,2,3 * @param[in] op Reduction operation to perform. + * @param[in] width (Optional) In case of x-axis we also need to provide the width of the input image. * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); + static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width = 0); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index 9b81eccafb..c0350bc7a4 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -522,6 +522,7 @@ enum class ReductionOperation { SUM_SQUARE, /**< Sum of squares */ SUM, /**< Sum */ + MEAN_SUM, /**< Mean of sum */ }; /** The normalization type used for the normalization layer */ diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h index d87a8285f4..6a614f7704 100644 --- a/arm_compute/runtime/CL/CLFunctions.h +++ b/arm_compute/runtime/CL/CLFunctions.h @@ -104,6 +104,7 @@ #include "arm_compute/runtime/CL/functions/CLRNNLayer.h" #include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h" #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h" +#include "arm_compute/runtime/CL/functions/CLReduceMean.h" #include "arm_compute/runtime/CL/functions/CLReductionOperation.h" #include "arm_compute/runtime/CL/functions/CLRemap.h" #include "arm_compute/runtime/CL/functions/CLReorgLayer.h" diff --git a/arm_compute/runtime/CL/functions/CLReduceMean.h b/arm_compute/runtime/CL/functions/CLReduceMean.h new file mode 100644 index 0000000000..5a919e5dcd --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLReduceMean.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CL_REDUCE_MEAN_H__ +#define __ARM_COMPUTE_CL_REDUCE_MEAN_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/runtime/CL/functions/CLArithmeticDivision.h" +#include "arm_compute/runtime/CL/functions/CLReductionOperation.h" +#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" +#include "arm_compute/runtime/IMemoryManager.h" + +namespace arm_compute +{ +// Forward Declarations +class ICLTensor; + +/** Basic function to perform reduce operation */ +class CLReduceMean : public IFunction +{ +public: + /** Default constructor */ + CLReduceMean(std::shared_ptr memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + */ + void configure(ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref CLReduceMean + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] output Destination tensor. Data type supported: Same as @p input + * + * @return A status + */ + static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + CLMemoryGroup _memory_group; + std::unique_ptr _reduction_kernels{ nullptr }; + std::unique_ptr _reduced_outs{ nullptr }; + CLReshapeLayer _reshape; + unsigned int _reduction_ops; + bool _keep_dims; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CL_REDUCE_MEAN_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h index d862aff7b7..42081786bc 100644 --- a/arm_compute/runtime/CL/functions/CLReductionOperation.h +++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h @@ -53,7 +53,7 @@ public: /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: F16/F32. Data layouts supported: NCHW. + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. Data layouts supported: NCHW. * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 * @param[in] op Reduction operation to perform. @@ -62,7 +62,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperation. * - * @param[in] input Source tensor info. Data types supported: F16/F32. Data layouts supported: NCHW. + * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. Data layouts supported: NCHW. * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input. * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 * @param[in] op Reduction operation to perform. 
@@ -80,6 +80,8 @@ private: std::unique_ptr _reduction_kernels_vector{ nullptr }; std::unique_ptr _border_handlers_vector{ nullptr }; unsigned int _num_of_stages; + unsigned int _reduction_axis; + bool _is_quantized; }; } #endif /*__ARM_COMPUTE_CLREDUCTIONOPERATION_H__ */ diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index ce4b85551d..957543c877 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -342,7 +342,11 @@ const std::map CLKernelLibrary::_kernel_program_map = { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" }, { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" }, { "quantization_layer", "quantization_layer.cl" }, - { "reduction_operation", "reduction_operation.cl" }, + { "reduction_operation_x", "reduction_operation.cl" }, + { "reduction_operation_quantized_x", "reduction_operation.cl" }, + { "reduction_operation_y", "reduction_operation.cl" }, + { "reduction_operation_z", "reduction_operation.cl" }, + { "reduction_operation_w", "reduction_operation.cl" }, { "remap_nearest_neighbour", "remap.cl" }, { "remap_bilinear", "remap.cl" }, { "reorg_layer_nchw", "reorg_layer.cl" }, diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl index aa7403b52b..c1be4472a7 100644 --- a/src/core/CL/cl_kernels/reduction_operation.cl +++ b/src/core/CL/cl_kernels/reduction_operation.cl @@ -61,13 +61,14 @@ inline DATA_TYPE sum(__global const DATA_TYPE *input) return (in.s0 + in.s1); } -/** This kernel performs reduction given an operation. +/** This kernel performs parallel reduction given an operation on x-axis. * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32 * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum + * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value + * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -81,7 +82,7 @@ inline DATA_TYPE sum(__global const DATA_TYPE *input) * @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor * @param[in] local_sums Local buffer for storing the partial sum */ -__kernel void reduction_operation( +__kernel void reduction_operation_x( IMAGE_DECLARATION(src), IMAGE_DECLARATION(partial_sum), __local DATA_TYPE *local_sums) @@ -109,7 +110,192 @@ __kernel void reduction_operation( if(lid == 0) { +#if defined(MEAN) && defined(WIDTH) + if(y == get_local_size(1) - 1) + { + local_sums[0] /= WIDTH; + } +#endif /* defined(MEAN) && defined(WIDTH) */ ((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0]; } } -} \ No newline at end of file +} + +#if defined(WIDTH) +/** This kernel performs reduction on x-axis. (QASYMM8) + * + * @note The width size must be passed at compile time using -DWIDTH e.g. 
-DWIDTH=128
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] output_ptr The local buffer to hold summed values. Supported data types: same as @p src_ptr
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor
+ */
+__kernel void reduction_operation_quantized_x(
+    VECTOR_DECLARATION(src),
+    VECTOR_DECLARATION(output))
+{
+    Vector src = CONVERT_TO_VECTOR_STRUCT(src);
+    Vector output = CONVERT_TO_VECTOR_STRUCT(output);
+
+    uint res = 0;
+
+    for(unsigned int x = 0; x < WIDTH; ++x)
+    {
+        res += *((__global uchar *)vector_offset(&src, x));
+    }
+
+#if defined(MEAN)
+    res /= WIDTH;
+#endif /* defined(MEAN) */
+
+    // Store result
+    *((__global uchar *)output.ptr) = convert_uchar(res);
+}
+#endif /* defined(WIDTH) */
+
+#if defined(HEIGHT)
+/** This kernel performs reduction on y-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] output_ptr The local buffer to hold summed values. Supported data types: same as @p src_ptr
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor
+ */
+__kernel void reduction_operation_y(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(output))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+    res = 0;
+
+    for(unsigned int y = 0; y < HEIGHT; ++y)
+    {
+        res += CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+    }
+
+#if defined(MEAN)
+    res /= HEIGHT;
+#endif /* defined(MEAN) */
+
+    // Store result
+    vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif /* defined(HEIGHT) */
+
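+/* Worked example (illustrative) for the y-axis kernel above: for a QASYMM8 input
+ * built with -DDATA_TYPE=uchar -DDATA_TYPE_PROMOTED=uint -DHEIGHT=3 -DMEAN, one
+ * lane of the uint16 accumulator that sees the column {10, 20, 40} sums to 70,
+ * the division by HEIGHT gives 70 / 3 = 23 in integer arithmetic, and the value
+ * is converted back to uchar on the final vstore16. The promoted accumulator
+ * type is what keeps the narrow 8-bit input from overflowing while summing.
+ */
+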
+#if defined(DEPTH)
+/** This kernel performs reduction on z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor
+ */
+__kernel void reduction_operation_z(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+    res = 0;
+
+    for(unsigned int z = 0; z < DEPTH; ++z)
+    {
+        res += CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+    }
+
+#if defined(MEAN)
+    res /= DEPTH;
+#endif /* defined(MEAN) */
+
+    // Store result
+    vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif /* defined(DEPTH) */
+
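+/* Note on the w-axis kernel below (illustrative, assuming the usual behaviour of
+ * the Tensor4D helpers): the 4D input reaches the kernel with the z and w
+ * dimensions collapsed, so CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH) needs
+ * -DDEPTH to split the collapsed index back into (z, w) coordinates, while
+ * -DBATCH only bounds the reduction loop. This is why the host side passes both
+ * defines when reducing over the batch dimension.
+ */
+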
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs reduction on w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor
+ */
+__kernel void reduction_operation_w(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+    res = 0;
+
+    for(unsigned int w = 0; w < BATCH; ++w)
+    {
+        res += CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+    }
+
+#if defined(MEAN)
+    res /= BATCH;
+#endif /* defined(MEAN) */
+
+    // Store result
+    vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif /* defined(BATCH) && defined(DEPTH) */
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index bf36ae2c0f..d4165ccd4e 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -39,24 +39,22 @@ using namespace arm_compute;
 namespace
 {
-// OpenCL kernel requires input width to be a power of 2.
+// OpenCL kernel requires input width to be a power of 2 for x-axis.
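+// Worked example (illustrative) of the padding computed in configure() below: with
+// border_val = 64 and an input width of 100, width_leftover = 100 % 64 = 36, so
+// border_width = 64 - 36 = 28 and the padded width becomes 128. That gives
+// num_of_threads = 128 / 16 = 8, i.e. one full work-group of 8 threads, each
+// processing 16 elements of the x-axis in the parallel reduction.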
constexpr unsigned int border_val = 64; -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width) { - ARM_COMPUTE_UNUSED(op); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW); - + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && axis != 0, "Not supported reduction operation for this axis"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); + ARM_COMPUTE_RETURN_ERROR_ON(op == ReductionOperation::MEAN_SUM && axis == 0 && width == 0 && input->data_type() != DataType::QASYMM8); if(output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); } return Status{}; @@ -69,16 +67,44 @@ std::tuple validate_and_configure_window(ITensorInfo *input, ITe output_shape.set(axis, 1); auto_init_if_empty(*output, output_shape, 1, input->data_type()); - const unsigned int num_elems_processed_per_iteration = 16; - - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - const unsigned int border_width = ((input->dimension(0) % border_val) != 0) ? border_val - input->dimension(0) % border_val : 0; + const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16; + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + bool window_changed = false; - AccessWindowStatic input_access(input, 0, 0, input->dimension(0) + border_width, 1); - AccessWindowHorizontal output_access(output, 0, 1); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, output->valid_region()); + switch(axis) + { + case 0: + { + if(is_data_type_quantized(input->data_type())) + { + AccessWindowHorizontal input_access(input, 0, input->dimension(0)); + AccessWindowHorizontal output_access(output, 0, 1); + window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + } + else + { + const unsigned int border_width = ((input->dimension(0) % border_val) != 0) ? 
border_val - input->dimension(0) % border_val : 0; + AccessWindowStatic input_access(input, 0, 0, input->dimension(0) + border_width, 1); + AccessWindowHorizontal output_access(output, 0, 1); + window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + } + } + break; + case 1: + case 2: + case 3: + { + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + } + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; @@ -96,46 +122,85 @@ BorderSize CLReductionOperationKernel::border_size() const return _border_size; } -void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) +void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); - - const unsigned int num_elems_processed_per_iteration = 16; - const unsigned int width_leftover = input->info()->dimension(0) % border_val; - const unsigned int border_width = (width_leftover != 0) ? border_val - width_leftover : 0; - const unsigned int num_of_threads = ((input->info()->dimension(0) + border_width) / 16); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op, width)); _input = input; _output = output; _reduction_axis = axis; _op = op; - // Set the number of WG based on the input size. If input width is < 128 - // we can use fewer threads than 8. 
- cl::NDRange lws_hint = cl::NDRange(std::min(8U, num_of_threads)); - _border_size = BorderSize(0, border_width, 0, 0); - // Set build options - std::set build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + CLBuildOptions build_opts; + std::string data_type_promoted = get_cl_type_from_data_type(input->info()->data_type()); + if(is_data_type_quantized(input->info()->data_type()) && axis != 0) + { + data_type_promoted = "uint"; + } + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted); + build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN"); switch(op) { case ReductionOperation::SUM_SQUARE: - build_opts.emplace(("-DOPERATION=square_sum")); + build_opts.add_option(("-DOPERATION=square_sum")); break; case ReductionOperation::SUM: - build_opts.emplace(("-DOPERATION=sum")); + case ReductionOperation::MEAN_SUM: + build_opts.add_option(("-DOPERATION=sum")); break; default: ARM_COMPUTE_ERROR("Unsupported reduction operation"); } // Create kernel - _kernel = static_cast(CLKernelLibrary::get().create_kernel("reduction_operation", build_opts)); + cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange(); + std::string kernel_axis_name; + switch(axis) + { + case 0: + { + if(!is_data_type_quantized(input->info()->data_type())) + { + build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width)); + const unsigned int width_leftover = input->info()->dimension(0) % border_val; + const unsigned int border_width = (width_leftover != 0) ? border_val - width_leftover : 0; + const unsigned int num_of_threads = ((input->info()->dimension(0) + border_width) / 16); + kernel_axis_name = "x"; + + // Set the number of WG based on the input size. If input width is < 128 + // we can use fewer threads than 8. 
+ lws_hint = cl::NDRange(std::min(8U, num_of_threads)); + _border_size = BorderSize(0, border_width, 0, 0); + } + else + { + build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0))); + kernel_axis_name = "quantized_x"; + } + } + break; + case 1: + build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1))); + kernel_axis_name = "y"; + break; + case 2: + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); + kernel_axis_name = "z"; + break; + case 3: + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DBATCH=" + support::cpp11::to_string(input->info()->dimension(3))); + kernel_axis_name = "w"; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + _kernel = static_cast(CLKernelLibrary::get().create_kernel("reduction_operation_" + kernel_axis_name, build_opts.options())); // Configure kernel window auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis); @@ -145,9 +210,9 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou ICLKernel::configure_internal(std::get<1>(win_config), lws_hint); } -Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) +Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op, width)); ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis))); return Status{}; @@ -158,28 +223,113 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - // Set out window - Window out_window(window); - out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); - - // Get first input and output slices - Window in_slice = window.first_slice_window_2D(); - Window out_slice = out_window.first_slice_window_2D(); - - // Reshape window - const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0; - in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step())); - - // Set local sums buffer - unsigned int local_sum_size = lws_hint()[0] * _input->info()->element_size(); - _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr); - - do + switch(_reduction_axis) { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, in_slice); - add_2D_tensor_argument(idx, _output, out_slice); - enqueue(queue, *this, in_slice, lws_hint()); + case 0: + { + // We use parallel reduction only in non quantized types + if(!is_data_type_quantized(_input->info()->data_type())) + { + // Set out window + Window out_window(window); + out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); + + // Get first input and output slices + Window in_slice = window.first_slice_window_2D(); + Window out_slice = out_window.first_slice_window_2D(); + + // Reshape window + const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? 
border_val - in_slice.x().end() % border_val : 0; + in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step())); + + // Set local sums buffer + unsigned int local_sum_size = lws_hint()[0] * _input->info()->element_size(); + _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, in_slice); + add_2D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, lws_hint()); + } + while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); + } + else + { + // Get first input and output slices + Window window_in{ window }; + window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); + + Window in_slice = window.first_slice_window_1D(); + Window out_slice = window.first_slice_window_1D(); + + do + { + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, in_slice); + add_1D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice); + } + while(window_in.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice)); + } + } + break; + case 1: + { + // Get first input and output slices + Window window_in{ window }; + window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1))); + Window in_slice = window_in.first_slice_window_2D(); + Window out_slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, in_slice); + add_2D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice); + } + while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); + } + break; + case 2: + { + // Get first input and output slices + Window window_in{ window }; + window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2))); + Window in_slice = window_in.first_slice_window_3D(); + Window out_slice = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, in_slice); + add_3D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice); + } + while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); + } + break; + case 3: + { + // Get first input and output slices + Window window_in{ window }; + window_in.set(3, Window::Dimension(0, 1, 1)); + Window in_slice = window_in.first_slice_window_4D(); + Window out_slice = window.first_slice_window_4D(); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, in_slice); + add_4D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice); + } + while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice)); + } + break; + default: + ARM_COMPUTE_ERROR("Not supported"); } - while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); } diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp new file mode 100644 index 0000000000..6e55b81d1c --- /dev/null +++ b/src/runtime/CL/functions/CLReduceMean.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLReduceMean.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/helpers/tensor_transform.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/ToolchainSupport.h" + +namespace arm_compute +{ +CLReduceMean::CLReduceMean(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims() +{ +} +void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _reduction_ops = reduction_axis.num_dimensions(); + _reduction_kernels = arm_compute::support::cpp14::make_unique(_reduction_ops); + _reduced_outs = arm_compute::support::cpp14::make_unique(_reduction_ops - (keep_dims ? 1 : 0)); + _keep_dims = keep_dims; + + // Perform reduction for every axis + for(unsigned int i = 0; i < _reduction_ops; ++i) + { + TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape(); + out_shape.set(reduction_axis[i], 1); + auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1); + + if(i == _reduction_ops - 1 && keep_dims) + { + _reduction_kernels[i].configure(in, output, reduction_axis[i], ReductionOperation::MEAN_SUM); + } + else + { + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info())); + _memory_group.manage(_reduced_outs.get() + i); + _reduction_kernels[i].configure(in, _reduced_outs.get() + i, reduction_axis[i], ReductionOperation::MEAN_SUM); + } + } + + // Allocate intermediate tensors + for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) + { + _reduced_outs[i].allocator()->allocate(); + } + + // Configure reshape layer if we want to drop the dimensions + if(!keep_dims) + { + TensorShape out_shape = input->info()->tensor_shape(); + for(unsigned int i = 0; i < _reduction_ops; ++i) + { + out_shape.remove_dimension(reduction_axis[i]); + } + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output); + } +} + +Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(keep_dims); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); + + for(unsigned int i = 0; i < reduction_axis.num_dimensions(); ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis[i] > 3); + if(output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(reduction_axis[i]) != 1); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast(reduction_axis[i]) > input->num_dimensions() - 1); + } + + ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, reduction_axis[i], ReductionOperation::MEAN_SUM, 0)); + } + + return Status{}; +} + +void CLReduceMean::run() +{ + _memory_group.acquire(); + + for(unsigned int i = 0; i < _reduction_ops; ++i) + { + _reduction_kernels[i].run(); + } + + if(!_keep_dims) + { + _reshape.run(); + } + _memory_group.release(); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index 2a171c3969..4b65c47392 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -37,8 +37,13 @@ using namespace arm_compute; namespace { -unsigned int calculate_number_of_stages(const ITensorInfo *input) +unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int axis) { + // We need only 1 stage for all axis except x-axis and x-axis for QASYMM8. + if(axis != 0 || (axis == 0 && is_data_type_quantized(input->data_type()))) + { + return 1; + } // Calculate number of WGs. 
16 elements per thread, 8 threads per WG const unsigned int num_of_wg = ceil(input->dimension(0) / 128.f); @@ -51,91 +56,132 @@ unsigned int calculate_number_of_stages(const ITensorInfo *input) } // namespace CLReductionOperation::CLReductionOperation(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages() + : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_quantized() { } Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) { - const unsigned int num_of_stages = calculate_number_of_stages(input); - - // Create temporary tensor infos - auto sums_vector = arm_compute::support::cpp14::make_unique(num_of_stages - 1); - - // Create intermediate tensor info - TensorShape shape{ input->tensor_shape() }; + const unsigned int num_of_stages = calculate_number_of_stages(input, axis); - for(unsigned int i = 0; i < num_of_stages - 1; i++) + if(axis == 0 && !is_data_type_quantized(input->data_type())) { - shape.set(0, ceil(shape.x() / 128.f)); - sums_vector[i].set_data_type(input->data_type()); - sums_vector[i].set_tensor_shape(shape); - sums_vector[i].set_num_channels(input->num_channels()); + // Create temporary tensor infos + auto sums_vector = arm_compute::support::cpp14::make_unique(num_of_stages - 1); + + // Create intermediate tensor info + TensorShape shape{ input->tensor_shape() }; + + for(unsigned int i = 0; i < num_of_stages - 1; i++) + { + shape.set(0, ceil(shape.x() / 128.f)); + sums_vector[i].set_data_type(input->data_type()); + sums_vector[i].set_tensor_shape(shape); + sums_vector[i].set_num_channels(input->num_channels()); + } + + // Validate ReductionOperation only on first kernel + ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, op)); + + // Validate ReductionOperation on intermediate stages + for(unsigned int i = 1; i < num_of_stages - 1; ++i) + { + ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, op)); + } + + // Validate ReductionOperation on the last stage + const unsigned int last_stage = num_of_stages - 1; + ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, op)); } - - // Validate ReductionOperation only on first kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, op)); - - // Validate ReductionOperation on intermediate stages - for(unsigned int i = 1; i < num_of_stages - 1; ++i) + else { - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, op)); + ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op)); } - // Validate ReductionOperation on the last stage - const unsigned int last_stage = num_of_stages - 1; - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, op)); - return Status{}; } void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op) { - _num_of_stages = calculate_number_of_stages(input->info()); - - // Create temporary tensors - _sums_vector = arm_compute::support::cpp14::make_unique(_num_of_stages - 1); + 
_num_of_stages = calculate_number_of_stages(input->info(), axis); + _reduction_axis = axis; + _is_quantized = is_data_type_quantized(input->info()->data_type()); // Configure reduction operation kernels _reduction_kernels_vector = arm_compute::support::cpp14::make_unique(_num_of_stages); - _border_handlers_vector = arm_compute::support::cpp14::make_unique(_num_of_stages); - TensorShape shape{ input->info()->tensor_shape() }; - for(unsigned int i = 0; i < _num_of_stages - 1; i++) + // Create temporary tensors + if(axis == 0 && !_is_quantized) { - shape.set(0, ceil(shape.x() / 128.f)); - _sums_vector[i].allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())); + _border_handlers_vector = arm_compute::support::cpp14::make_unique(_num_of_stages); + _sums_vector = arm_compute::support::cpp14::make_unique(_num_of_stages - 1); + TensorShape shape{ input->info()->tensor_shape() }; + for(unsigned int i = 0; i < _num_of_stages - 1; i++) + { + shape.set(0, ceil(shape.x() / 128.f)); + _sums_vector[i].allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())); + } + + // Apply ReductionOperation only on first kernel + _memory_group.manage(_sums_vector.get()); + + ReductionOperation first_kernel_op; + ReductionOperation last_kernel_op; + switch(op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + first_kernel_op = ReductionOperation::SUM; + last_kernel_op = op; + break; + case ReductionOperation::SUM_SQUARE: + first_kernel_op = ReductionOperation::SUM_SQUARE; + last_kernel_op = ReductionOperation::SUM; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + + _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, first_kernel_op); + _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0)); + + // Apply ReductionOperation on intermediate stages + for(unsigned int i = 1; i < _num_of_stages - 1; ++i) + { + _memory_group.manage(_sums_vector.get() + i); + _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM); + _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0)); + _sums_vector[i - 1].allocator()->allocate(); + } + + // Apply ReductionOperation on the last stage + const unsigned int last_stage = _num_of_stages - 1; + const unsigned int input_width = input->info()->dimension(0); + _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width); + _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0)); + _sums_vector[last_stage - 1].allocator()->allocate(); } - - // Apply ReductionOperation only on first kernel - _memory_group.manage(_sums_vector.get()); - _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, op); - _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0)); - - // Apply ReductionOperation on intermediate stages - for(unsigned int i = 1; i < _num_of_stages - 1; ++i) + else { - _memory_group.manage(_sums_vector.get() + i); - _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM); - 
_border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0)); - _sums_vector[i - 1].allocator()->allocate(); + _reduction_kernels_vector[0].configure(input, output, axis, op, 0); } - - // Apply ReductionOperation on the last stage - const unsigned int last_stage = _num_of_stages - 1; - _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, ReductionOperation::SUM); - _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0)); - _sums_vector[last_stage - 1].allocator()->allocate(); } void CLReductionOperation::run() { _memory_group.acquire(); - for(unsigned int i = 0; i < _num_of_stages; ++i) + if(_reduction_axis == 0 && !_is_quantized) + { + for(unsigned int i = 0; i < _num_of_stages; ++i) + { + CLScheduler::get().enqueue(_border_handlers_vector[i], false); + CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); + } + } + else { - CLScheduler::get().enqueue(_border_handlers_vector[i], false); - CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); + CLScheduler::get().enqueue(_reduction_kernels_vector[0], false); } _memory_group.release(); diff --git a/tests/datasets/ReductionOperationDataset.h b/tests/datasets/ReductionOperationDataset.h index e710588921..dadc4e9249 100644 --- a/tests/datasets/ReductionOperationDataset.h +++ b/tests/datasets/ReductionOperationDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -42,7 +42,7 @@ public: ReductionOperations() : ContainerDataset("ReductionOperation", { - ReductionOperation::SUM_SQUARE + ReductionOperation::SUM }) { } diff --git a/tests/validation/CL/ReduceMean.cpp b/tests/validation/CL/ReduceMean.cpp new file mode 100644 index 0000000000..07e859f391 --- /dev/null +++ b/tests/validation/CL/ReduceMean.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "arm_compute/runtime/CL/functions/CLReduceMean.h" + +#include "tests/CL/CLAccessor.h" +#include "tests/datasets/ShapeDatasets.h" +#include "tests/datasets/SplitDataset.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" +#include "tests/validation/Validation.h" +#include "tests/validation/fixtures/ReduceMeanFixture.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace +{ +constexpr AbsoluteTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */ +constexpr AbsoluteTolerance tolerance_f16(0.03f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */ +constexpr AbsoluteTolerance tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric quantized type */ + +const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(0, 1, 2, 3) }), + framework::dataset::make("KeepDims", { true })); +const auto axis_drop = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1), Coordinates(3) }), framework::dataset::make("KeepDims", { false })); +} // namespace +TEST_SUITE(CL) +TEST_SUITE(ReduceMean) + +// *INDENT-OFF* +// clang-format off +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( + framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid axis + TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid output shape + TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32) + }), + framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::F32) + })), + framework::dataset::make("Axis", { Coordinates(4), Coordinates(0,2), Coordinates(2) })), + framework::dataset::make("Expected", { false, false, true })), + input_info, output_info, axis, expected) +{ + const Status status = CLReduceMean::validate(&input_info.clone()->set_is_resizable(false), axis, true, &output_info.clone()->set_is_resizable(false)); + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); +} +// clang-format on +// *INDENT-ON* + +DATA_TEST_CASE(Configuration, + framework::DatasetMode::ALL, + combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::F16, DataType::F32 })), + shape, data_type) +{ + // Create tensors + CLTensor ref_src = create_tensor(shape, data_type); + CLTensor dst; + + Coordinates axis(1); + + // Create and Configure function + CLReduceMean reduce_mean; + reduce_mean.configure(&ref_src, axis, true, &dst); + + // Validate valid region + TensorShape output_shape = shape; + output_shape.set(1, 1); + const ValidRegion valid_region = shape_to_valid_region(output_shape); + validate(dst.info()->valid_region(), valid_region); +} + +template +using CLReduceMeanFixture = ReduceMeanFixture; + +TEST_SUITE(Float) +TEST_SUITE(FP16) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReduceMeanFixture, + framework::DatasetMode::PRECOMMIT, + combine(combine(datasets::Small4DShapes(), 
framework::dataset::make("DataType", DataType::F16)), concat(axis_keep, axis_drop))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f16); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, + CLReduceMeanFixture, + framework::DatasetMode::NIGHTLY, + combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F16)), concat(axis_keep, axis_drop))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f16); +} +TEST_SUITE_END() // FP16 + +TEST_SUITE(FP32) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReduceMeanFixture, + framework::DatasetMode::PRECOMMIT, + combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), concat(axis_keep, axis_drop))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, + CLReduceMeanFixture, + framework::DatasetMode::NIGHTLY, + combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F32)), concat(axis_keep, axis_drop))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} +TEST_SUITE_END() // FP32 +TEST_SUITE_END() // Float + +template +using CLReduceMeanQuantizedFixture = ReduceMeanQuantizedFixture; + +TEST_SUITE(Quantized) +TEST_SUITE(QASYMM8) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReduceMeanQuantizedFixture, + framework::DatasetMode::PRECOMMIT, + combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), concat(axis_keep, axis_drop)), framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 0) }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, + CLReduceMeanQuantizedFixture, + framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), concat(axis_keep, axis_drop)), framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 0) }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // QASYMM8 +TEST_SUITE_END() // Quantized +TEST_SUITE_END() // ReduceMean +TEST_SUITE_END() // CL +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/CL/ReductionOperation.cpp b/tests/validation/CL/ReductionOperation.cpp index ca0988f955..794db1a3e5 100644 --- a/tests/validation/CL/ReductionOperation.cpp +++ b/tests/validation/CL/ReductionOperation.cpp @@ -45,7 +45,7 @@ namespace { /** Tolerance for float operations */ RelativeTolerance tolerance_f32(0.00001f); -RelativeTolerance tolerance_f16(0.1f); +AbsoluteTolerance tolerance_f16(0.1f); } // namespace TEST_SUITE(CL) @@ -58,7 +58,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(128U, 64U), 2, DataType::F32), // Number of Input channels != 1 TensorInfo(TensorShape(128U, 64U), 1, DataType::S16), // DataType != F16/F32 TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis >= num_max_dimensions - TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 0 + TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 0 and SUM_SQUARE TensorInfo(TensorShape(128U, 64U), 1, DataType::F32) }), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(1U, 64U), 1, DataType::F16), @@ -87,13 +87,13 @@ using CLReductionOperationFixture = ReductionOperationValidationFixture, 
framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0 })), datasets::ReductionOperations())) + combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), datasets::ReductionOperations())) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16); } FIXTURE_DATA_TEST_CASE(RunLarge, CLReductionOperationFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0 })), datasets::ReductionOperations())) + combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), datasets::ReductionOperations())) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16); @@ -101,13 +101,13 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLReductionOperationFixture, framework::D TEST_SUITE_END() // F16 TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, CLReductionOperationFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0 })), datasets::ReductionOperations())) + combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), datasets::ReductionOperations())) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, CLReductionOperationFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0 })), datasets::ReductionOperations())) + combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), datasets::ReductionOperations())) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); diff --git a/tests/validation/NEON/ReductionOperation.cpp b/tests/validation/NEON/ReductionOperation.cpp index c2f2909c66..b0480b0bc6 100644 --- a/tests/validation/NEON/ReductionOperation.cpp +++ b/tests/validation/NEON/ReductionOperation.cpp @@ -85,13 +85,13 @@ using NEReductionOperationFixture = ReductionOperationValidationFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0 })), datasets::ReductionOperations())) + combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0 })), framework::dataset::make("Op", { ReductionOperation::SUM_SQUARE }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, NEReductionOperationFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0 })), datasets::ReductionOperations())) + combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0 })), 
framework::dataset::make("Op", { ReductionOperation::SUM_SQUARE }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); diff --git a/tests/validation/fixtures/ReduceMeanFixture.h b/tests/validation/fixtures/ReduceMeanFixture.h new file mode 100644 index 0000000000..6debd4a038 --- /dev/null +++ b/tests/validation/fixtures/ReduceMeanFixture.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_TEST_REDUCE_MEAN_FIXTURE +#define ARM_COMPUTE_TEST_REDUCE_MEAN_FIXTURE + +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/Tensor.h" +#include "tests/AssetsLibrary.h" +#include "tests/Globals.h" +#include "tests/IAccessor.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Fixture.h" +#include "tests/validation/reference/ReductionOperation.h" +#include "tests/validation/reference/ReshapeLayer.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +template +class ReduceMeanValidationFixture : public framework::Fixture +{ +public: + template + void setup(TensorShape shape, DataType data_type, Coordinates axis, bool keep_dims, QuantizationInfo quantization_info) + { + _target = compute_target(shape, data_type, axis, keep_dims, quantization_info); + _reference = compute_reference(shape, data_type, axis, keep_dims, quantization_info); + } + +protected: + template + void fill(U &&tensor) + { + if(!is_data_type_quantized(tensor.data_type())) + { + std::uniform_real_distribution<> distribution(-1.0f, 1.0f); + library->fill(tensor, distribution, 0); + } + else + { + const QuantizationInfo quant_info = tensor.quantization_info(); + const int min_bound = quant_info.quantize(-1.f, RoundingPolicy::TO_NEAREST_UP); + const int max_bound = quant_info.quantize(1.f, RoundingPolicy::TO_NEAREST_UP); + std::uniform_int_distribution<> distribution(min_bound, max_bound); + + library->fill(tensor, distribution, 0); + } + } + + TensorType compute_target(TensorShape &src_shape, DataType data_type, Coordinates axis, bool keep_dims, QuantizationInfo quantization_info) + { + // Create tensors + TensorType src = create_tensor(src_shape, data_type, 1, quantization_info); + TensorType dst; + + // Create and configure function + FunctionType reduction_mean; + reduction_mean.configure(&src, axis, keep_dims, &dst); + + ARM_COMPUTE_EXPECT(src.info()->is_resizable(), 
framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Allocate tensors + src.allocator()->allocate(); + dst.allocator()->allocate(); + + ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Fill tensors + fill(AccessorType(src)); + + // Compute function + reduction_mean.run(); + + return dst; + } + + SimpleTensor compute_reference(TensorShape &src_shape, DataType data_type, Coordinates axis, bool keep_dims, QuantizationInfo quantization_info) + { + // Create reference + SimpleTensor src{ src_shape, data_type, 1, quantization_info }; + + // Fill reference + fill(src); + + SimpleTensor out; + for(unsigned int i = 0; i < axis.num_dimensions(); ++i) + { + TensorShape output_shape = i == 0 ? src_shape : out.shape(); + output_shape.set(axis[i], 1); + out = reference::reduction_operation(i == 0 ? src : out, output_shape, axis[i], ReductionOperation::MEAN_SUM); + } + + if(!keep_dims) + { + TensorShape output_shape = src_shape; + for(unsigned int i = 0; i < axis.num_dimensions(); ++i) + { + output_shape.remove_dimension(axis[i]); + } + + out = reference::reshape_layer(out, output_shape); + } + return out; + } + + TensorType _target{}; + SimpleTensor _reference{}; +}; + +template +class ReduceMeanQuantizedFixture : public ReduceMeanValidationFixture +{ +public: + template + void setup(TensorShape shape, DataType data_type, Coordinates axis, bool keep_dims, QuantizationInfo quantization_info = QuantizationInfo()) + { + ReduceMeanValidationFixture::setup(shape, data_type, axis, keep_dims, quantization_info); + } +}; + +template +class ReduceMeanFixture : public ReduceMeanValidationFixture +{ +public: + template + void setup(TensorShape shape, DataType data_type, Coordinates axis, bool keep_dims) + { + ReduceMeanValidationFixture::setup(shape, data_type, axis, keep_dims, QuantizationInfo()); + } +}; +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif /* ARM_COMPUTE_TEST_REDUCE_MEAN_FIXTURE */ diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp index 871a761b1a..11947bd293 100644 --- a/tests/validation/reference/ReductionOperation.cpp +++ b/tests/validation/reference/ReductionOperation.cpp @@ -47,6 +47,15 @@ struct square } }; +template +struct sum +{ + T operator()(const T &lhs, const T &rhs) const + { + return (lhs + rhs); + } +}; + template T reduce_operation(T *ptr, int reduce_elements, ReductionOperation op) { @@ -54,6 +63,9 @@ T reduce_operation(T *ptr, int reduce_elements, ReductionOperation op) { case ReductionOperation::SUM_SQUARE: return std::accumulate(ptr, ptr + reduce_elements, static_cast(0), square()); + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + return std::accumulate(ptr, ptr + reduce_elements, static_cast(0), sum()); default: ARM_COMPUTE_ERROR("Unsupported reduction operation"); } @@ -64,23 +76,172 @@ template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op) { // Create reference - SimpleTensor dst{ dst_shape, src.data_type() }; + SimpleTensor dst{ dst_shape, src.data_type() }; + const unsigned int src_width = src.shape().x(); + const unsigned int src_height = src.shape().y(); + const unsigned int src_depth = src.shape().z(); + const unsigned int src_batch = src.shape()[3]; + const bool mean = op == 
ReductionOperation::MEAN_SUM; - // Compute reference - const int reduce_elems = src.shape()[axis]; - const int upper_dims = src.shape().total_size_upper(axis + 1); - - for(int du = 0; du < upper_dims; ++du) + switch(axis) { - if(axis == 0) + case 0: + { + const int reduce_elems = src.shape()[axis]; + const unsigned int upper_dims = src.shape().total_size_upper(1); + for(unsigned int du = 0; du < upper_dims; ++du) + { + if(std::is_integral::value) + { + uint32_t res = 0; + for(unsigned int x = 0; x < src_width; ++x) + { + res += static_cast(src[du * src_width + x]); + } + if(mean && src_width > 0) + { + res /= src_width; + } + dst[du] = static_cast(res); + } + else + { + const T *src_row_ptr = src.data() + du * reduce_elems; + + auto res = reduce_operation(src_row_ptr, reduce_elems, op); + if(mean && src_width > 0) + { + res /= src_width; + } + dst[du] = res; + } + } + } + break; + case 1: { - const T *src_row_ptr = src.data() + du * reduce_elems; - dst[du] = reduce_operation(src_row_ptr, reduce_elems, op); + const unsigned int upper_dims = src.shape().total_size_upper(2); + for(unsigned int du = 0; du < upper_dims; ++du) + { + for(unsigned int x = 0; x < src_width; ++x) + { + if(std::is_integral::value) + { + uint32_t res = 0; + for(unsigned int y = 0; y < src_height; ++y) + { + res += static_cast(src[du * src_height * src_width + y * src_width + x]); + } + if(mean && src_height > 0) + { + res /= src_height; + } + dst[du * src_width + x] = static_cast(res); + } + else + { + auto res = T(0); + for(unsigned int y = 0; y < src_height; ++y) + { + res += src[du * src_height * src_width + y * src_width + x]; + } + if(mean && src_height > 0) + { + res /= src_height; + } + dst[du * src_width + x] = res; + } + } + } } - else + break; + case 2: { - ARM_COMPUTE_ERROR("Unsupported reduction axis"); + const unsigned int upper_dims = src.shape().total_size_upper(3); + for(unsigned int du = 0; du < upper_dims; ++du) + { + for(unsigned int x = 0; x < src_width; ++x) + { + for(unsigned int y = 0; y < src_height; ++y) + { + if(std::is_integral::value) + { + uint32_t res = T(0); + for(unsigned int z = 0; z < src_depth; ++z) + { + res += static_cast(src[du * src_depth * src_height * src_width + z * src_height * src_width + y * src_width + x]); + } + if(mean && src_depth > 0) + { + res /= src_depth; + } + dst[du * src_width * src_height + y * src_width + x] = static_cast(res); + } + else + { + auto res = T(0); + for(unsigned int z = 0; z < src_depth; ++z) + { + res += src[du * src_depth * src_height * src_width + z * src_height * src_width + y * src_width + x]; + } + if(mean && src_depth > 0) + { + res /= src_depth; + } + dst[du * src_width * src_height + y * src_width + x] = res; + } + } + } + } } + break; + case 3: + { + const unsigned int upper_dims = src.shape().total_size_upper(4); + for(unsigned int du = 0; du < upper_dims; ++du) + { + for(unsigned int z = 0; z < src_depth; ++z) + { + for(unsigned int y = 0; y < src_height; ++y) + { + for(unsigned int x = 0; x < src_width; ++x) + { + if(std::is_integral::value) + { + uint32_t res = 0; + for(unsigned int w = 0; w < src_batch; ++w) + { + res += static_cast(src[du * src_batch * src_depth * src_height * src_width + w * src_width * src_height * src_depth + z * src_width * src_height + y * src_width + x]); + } + if(mean && src_batch > 0) + { + res /= src_batch; + } + + dst[du * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x] = static_cast(res); + } + else + { + auto res = T(0); + for(unsigned int w = 0; w < src_batch; ++w) 
+                                {
+                                    res += src[du * src_batch * src_depth * src_height * src_width + w * src_width * src_height * src_depth + z * src_width * src_height + y * src_width + x];
+                                }
+                                if(mean && src_batch > 0)
+                                {
+                                    res /= src_batch;
+                                }
+
+                                dst[du * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x] = res;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported reduction axis");
     }
 
     return dst;
@@ -88,6 +249,7 @@ SimpleTensor<T> reduction_operation(const SimpleTensor<T> &src, const TensorShap
 
 template SimpleTensor<float> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
 template SimpleTensor<half> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
+template SimpleTensor<uint8_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ReductionOperation.h b/tests/validation/reference/ReductionOperation.h
index 6da6436686..859b57aa7b 100644
--- a/tests/validation/reference/ReductionOperation.h
+++ b/tests/validation/reference/ReductionOperation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 10e407ddfa..b868aa94a1 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -1220,9 +1220,15 @@ inline ::std::ostream &operator<<(::std::ostream &os, const ReductionOperation &
 {
     switch(op)
     {
+        case ReductionOperation::SUM:
+            os << "SUM";
+            break;
         case ReductionOperation::SUM_SQUARE:
             os << "SUM_SQUARE";
             break;
+        case ReductionOperation::MEAN_SUM:
+            os << "MEAN_SUM";
+            break;
         default:
             ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
     }
--
cgit v1.2.1
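
Note on using the new runtime function: the Configuration data test above only checks shape propagation, so a minimal end-to-end sketch may help readers trying the API. The sketch below is not part of the patch; the tensor shape, axis choice and fill step are illustrative assumptions, while the configure(input, reduction_axis, keep_dims, output) call matches the CLReduceMean interface introduced here.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReduceMean.h"

    using namespace arm_compute;

    int main()
    {
        // Set up the default OpenCL context and command queue.
        CLScheduler::get().default_init();

        // Illustrative 4D input (W=32, H=16, C=16, N=2); shape and data type are assumptions.
        CLTensor src;
        CLTensor dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32));

        // Mean over axes 1 and 2, keeping the reduced dimensions; dst is auto-initialised by configure().
        CLReduceMean reduce_mean;
        reduce_mean.configure(&src, Coordinates(1, 2), true /* keep_dims */, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();

        // ... fill src with input data here ...

        reduce_mean.run();
        CLScheduler::get().sync();
        return 0;
    }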
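The ReduceMeanFixture reference path computes a multi-axis mean as a chain of single-axis MEAN_SUM reductions, each dividing by the length of the reduced dimension; reducing axes {1, 2} therefore divides by the height and then by the depth, which is the same as dividing the full sum by height * depth. A standalone sketch of one such single-axis step is shown below, under the assumption of a dense, row-major W x H buffer; the function name and types are illustrative and not library code.

    #include <cstddef>
    #include <vector>

    // Mean along y (axis 1): for each column x, sum the H values and divide by H.
    std::vector<float> reduce_mean_axis1(const std::vector<float> &src, std::size_t width, std::size_t height)
    {
        std::vector<float> dst(width, 0.0f);
        for(std::size_t x = 0; x < width; ++x)
        {
            float sum = 0.0f;
            for(std::size_t y = 0; y < height; ++y)
            {
                sum += src[y * width + x];
            }
            dst[x] = (height > 0) ? sum / static_cast<float>(height) : 0.0f;
        }
        return dst;
    }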
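For quantized inputs the reference accumulates the uint8 values in a 32-bit integer and performs an integer division by the number of reduced elements, so the mean is truncated: averaging the quantized values 3 and 4, for example, gives 7 / 2 = 3 rather than 3.5. That truncation is one reason the QASYMM8 tests above compare the implementation against the reference with an absolute tolerance of 1.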