From d1794ebfa10d05af7d2458c5d506152fd38068d3 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Fri, 15 Jun 2018 16:15:26 +0100 Subject: COMPMID-1226 Extend CLMeanStdDev to support FP32 / FP16 - Extend support for FP16 in CLReduction. - For F16/F32 MeanStdDev we perform one reduction operation for mean and one for stddev and we calculate the final result in the host CPU. Change-Id: Iad2099f26c0ba7969737d22f00c6c275634d875c Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/135870 Tested-by: Jenkins Reviewed-by: Georgios Pinitas --- src/core/CL/kernels/CLMeanStdDevKernel.cpp | 23 ++++++++++++++++------ src/core/CL/kernels/CLReductionOperationKernel.cpp | 4 ++-- 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'src/core/CL/kernels') diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp index fc8764dbfe..bd31131fe5 100644 --- a/src/core/CL/kernels/CLMeanStdDevKernel.cpp +++ b/src/core/CL/kernels/CLMeanStdDevKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h" +#include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -49,14 +50,24 @@ BorderSize CLMeanStdDevKernel::border_size() const return _border_size; } +Status CLMeanStdDevKernel::validate(const ITensorInfo *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared) +{ + ARM_COMPUTE_UNUSED(mean); + ARM_COMPUTE_UNUSED(stddev); + ARM_COMPUTE_UNUSED(global_sum); + ARM_COMPUTE_UNUSED(global_sum_squared); + ARM_COMPUTE_RETURN_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED(); + ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + + return Status{}; +} + void CLMeanStdDevKernel::configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared) { - ARM_COMPUTE_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED(); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(nullptr == mean); - ARM_COMPUTE_ERROR_ON(nullptr == global_sum); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, global_sum); ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared); + ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevKernel::validate(input->info(), mean, global_sum, stddev, global_sum_squared)); _input = input; _mean = mean; diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp index d64f0d89c5..95967fa974 100644 --- a/src/core/CL/kernels/CLReductionOperationKernel.cpp +++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp @@ -44,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_UNUSED(op); 
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); @@ -69,7 +69,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe const unsigned int num_elems_processed_per_iteration = 16; Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - const unsigned int border_width = ((input->dimension(0) % 128) != 0) ? 128 - input->dimension(0) % 128 : 0; // TODO (COMPMID-1143): Fix padding (possible value 127!) + const unsigned int border_width = ((input->dimension(0) % num_elems_processed_per_iteration) != 0) ? num_elems_processed_per_iteration - input->dimension(0) % num_elems_processed_per_iteration : 0; AccessWindowStatic input_access(input, 0, 0, input->dimension(0) + border_width, 1); AccessWindowHorizontal output_access(output, 0, 1); -- cgit v1.2.1