aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/CL/functions/CLReductionOperation.cpp
diff options
context:
space:
mode:
authorMichalis Spyrou <michalis.spyrou@arm.com>2018-10-05 14:49:28 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:55:45 +0000
commit7e9391bb14d219cda310bff355669b5964b1f576 (patch)
tree789142f63d4c8e95612b042d07a0683cfe596fb9 /src/runtime/CL/functions/CLReductionOperation.cpp
parent555c3d6448a1dc7b326fad2ab7f75eccc8e5cff6 (diff)
downloadComputeLibrary-7e9391bb14d219cda310bff355669b5964b1f576.tar.gz
COMPMID-1574 Implement ReduceMean in OpenCL
Change-Id: Id331199f569f52a37280a9ada5bf84694580b93c Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/152843 Tested-by: bsgcomp <bsgcomp@arm.com> Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
Diffstat (limited to 'src/runtime/CL/functions/CLReductionOperation.cpp')
-rw-r--r--src/runtime/CL/functions/CLReductionOperation.cpp156
1 files changed, 101 insertions, 55 deletions
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 2a171c3969..4b65c47392 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -37,8 +37,13 @@ using namespace arm_compute;
namespace
{
-unsigned int calculate_number_of_stages(const ITensorInfo *input)
+unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int axis)
{
+ // We need only 1 stage for all axis except x-axis and x-axis for QASYMM8.
+ if(axis != 0 || (axis == 0 && is_data_type_quantized(input->data_type())))
+ {
+ return 1;
+ }
// Calculate number of WGs. 16 elements per thread, 8 threads per WG
const unsigned int num_of_wg = ceil(input->dimension(0) / 128.f);
@@ -51,91 +56,132 @@ unsigned int calculate_number_of_stages(const ITensorInfo *input)
} // namespace
CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
+ : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_quantized()
{
}
Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
- const unsigned int num_of_stages = calculate_number_of_stages(input);
-
- // Create temporary tensor infos
- auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
-
- // Create intermediate tensor info
- TensorShape shape{ input->tensor_shape() };
+ const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
- for(unsigned int i = 0; i < num_of_stages - 1; i++)
+ if(axis == 0 && !is_data_type_quantized(input->data_type()))
{
- shape.set(0, ceil(shape.x() / 128.f));
- sums_vector[i].set_data_type(input->data_type());
- sums_vector[i].set_tensor_shape(shape);
- sums_vector[i].set_num_channels(input->num_channels());
+ // Create temporary tensor infos
+ auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
+
+ // Create intermediate tensor info
+ TensorShape shape{ input->tensor_shape() };
+
+ for(unsigned int i = 0; i < num_of_stages - 1; i++)
+ {
+ shape.set(0, ceil(shape.x() / 128.f));
+ sums_vector[i].set_data_type(input->data_type());
+ sums_vector[i].set_tensor_shape(shape);
+ sums_vector[i].set_num_channels(input->num_channels());
+ }
+
+ // Validate ReductionOperation only on first kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, op));
+
+ // Validate ReductionOperation on intermediate stages
+ for(unsigned int i = 1; i < num_of_stages - 1; ++i)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, op));
+ }
+
+ // Validate ReductionOperation on the last stage
+ const unsigned int last_stage = num_of_stages - 1;
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, op));
}
-
- // Validate ReductionOperation only on first kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, op));
-
- // Validate ReductionOperation on intermediate stages
- for(unsigned int i = 1; i < num_of_stages - 1; ++i)
+ else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, op));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
}
- // Validate ReductionOperation on the last stage
- const unsigned int last_stage = num_of_stages - 1;
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, op));
-
return Status{};
}
void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
{
- _num_of_stages = calculate_number_of_stages(input->info());
-
- // Create temporary tensors
- _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+ _num_of_stages = calculate_number_of_stages(input->info(), axis);
+ _reduction_axis = axis;
+ _is_quantized = is_data_type_quantized(input->info()->data_type());
// Configure reduction operation kernels
_reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
- _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
- TensorShape shape{ input->info()->tensor_shape() };
- for(unsigned int i = 0; i < _num_of_stages - 1; i++)
+ // Create temporary tensors
+ if(axis == 0 && !_is_quantized)
{
- shape.set(0, ceil(shape.x() / 128.f));
- _sums_vector[i].allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
+ _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
+ _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+ TensorShape shape{ input->info()->tensor_shape() };
+ for(unsigned int i = 0; i < _num_of_stages - 1; i++)
+ {
+ shape.set(0, ceil(shape.x() / 128.f));
+ _sums_vector[i].allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
+ }
+
+ // Apply ReductionOperation only on first kernel
+ _memory_group.manage(_sums_vector.get());
+
+ ReductionOperation first_kernel_op;
+ ReductionOperation last_kernel_op;
+ switch(op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ first_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = op;
+ break;
+ case ReductionOperation::SUM_SQUARE:
+ first_kernel_op = ReductionOperation::SUM_SQUARE;
+ last_kernel_op = ReductionOperation::SUM;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+
+ _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, first_kernel_op);
+ _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+ // Apply ReductionOperation on intermediate stages
+ for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
+ {
+ _memory_group.manage(_sums_vector.get() + i);
+ _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
+ _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _sums_vector[i - 1].allocator()->allocate();
+ }
+
+ // Apply ReductionOperation on the last stage
+ const unsigned int last_stage = _num_of_stages - 1;
+ const unsigned int input_width = input->info()->dimension(0);
+ _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
+ _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _sums_vector[last_stage - 1].allocator()->allocate();
}
-
- // Apply ReductionOperation only on first kernel
- _memory_group.manage(_sums_vector.get());
- _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, op);
- _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
-
- // Apply ReductionOperation on intermediate stages
- for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
+ else
{
- _memory_group.manage(_sums_vector.get() + i);
- _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
- _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
- _sums_vector[i - 1].allocator()->allocate();
+ _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
}
-
- // Apply ReductionOperation on the last stage
- const unsigned int last_stage = _num_of_stages - 1;
- _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, ReductionOperation::SUM);
- _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
- _sums_vector[last_stage - 1].allocator()->allocate();
}
void CLReductionOperation::run()
{
_memory_group.acquire();
- for(unsigned int i = 0; i < _num_of_stages; ++i)
+ if(_reduction_axis == 0 && !_is_quantized)
+ {
+ for(unsigned int i = 0; i < _num_of_stages; ++i)
+ {
+ CLScheduler::get().enqueue(_border_handlers_vector[i], false);
+ CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
+ }
+ }
+ else
{
- CLScheduler::get().enqueue(_border_handlers_vector[i], false);
- CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
+ CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
}
_memory_group.release();