From b412fab0e3c8ec10e104f4d85760898a5b26179c Mon Sep 17 00:00:00 2001
From: Manuel Bottini
Date: Mon, 10 Dec 2018 17:40:23 +0000
Subject: COMPMID-1724: CL Implement Prod

Change-Id: I17e51f25064b53a8f7e13d6fcbecc14a192de103
Reviewed-on: https://review.mlplatform.org/387
Reviewed-by: Georgios Pinitas
Tested-by: Arm Jenkins
---
 src/core/CL/kernels/CLReductionOperationKernel.cpp | 72 ++++++++++++----------
 1 file changed, 38 insertions(+), 34 deletions(-)

(limited to 'src/core/CL/kernels/CLReductionOperationKernel.cpp')

diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 959209edc0..45aa810517 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -80,13 +80,13 @@ std::tuple validate_and_configure_window(ITensorInfo *input, ITe
     const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
     Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
     bool window_changed = false;
-    const bool is_arg_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
+    const bool is_serial_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(input->data_type()));
 
     switch(axis)
     {
         case 0:
         {
-            if(is_data_type_quantized(input->data_type()) || is_arg_op)
+            if(is_serial_op)
             {
                 AccessWindowHorizontal input_access(input, 0, input->dimension(0));
                 AccessWindowHorizontal output_access(output, 0, 1);
@@ -153,10 +153,11 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
     }
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
-    build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE=");
+    build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE");
     build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
     build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX");
     build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MIN, "-DARG_MIN");
+    build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
 
     switch(op)
     {
@@ -170,6 +171,9 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
         case ReductionOperation::ARG_IDX_MAX:
         case ReductionOperation::ARG_IDX_MIN:
             break;
+        case ReductionOperation::PROD:
+            build_opts.add_option(("-DOPERATION=product"));
+            break;
         default:
             ARM_COMPUTE_ERROR("Unsupported reduction operation");
     }
@@ -177,12 +181,18 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
     // Create kernel
     cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
     std::string kernel_axis_name;
-    const bool is_arg_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
+    const bool is_serial_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(input->info()->data_type()));
     switch(axis)
     {
         case 0:
         {
-            if(!is_data_type_quantized(input->info()->data_type()) && !is_arg_op)
+            if(is_serial_op)
+            {
+                build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+                build_opts.add_option_if_else(_input->info()->data_type() == DataType::F32, "-DCOND_DATA_TYPE=int", "-DCOND_DATA_TYPE=short");
+                kernel_axis_name = "non_parallel_x";
+            }
+            else
             {
                 build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width));
                 const unsigned int width_leftover = input->info()->dimension(0) % border_val;
@@ -195,12 +205,6 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
                 lws_hint = cl::NDRange(std::min(8U, num_of_threads));
                 _border_size = BorderSize(0, border_width, 0, 0);
             }
-            else
-            {
-                build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
-                build_opts.add_option_if_else(_input->info()->data_type() == DataType::F32, "-DCOND_DATA_TYPE=int", "-DCOND_DATA_TYPE=short");
-                kernel_axis_name = "non_parallel_x";
-            }
         }
         break;
         case 1:
@@ -242,13 +246,31 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    const bool is_arg_op = (_op == ReductionOperation::ARG_IDX_MAX || _op == ReductionOperation::ARG_IDX_MIN);
+    const bool is_serial_op = (_op == ReductionOperation::ARG_IDX_MAX || _op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(_input->info()->data_type()));
     switch(_reduction_axis)
     {
         case 0:
         {
             // We use parallel reduction only in non quantized types
-            if(!is_data_type_quantized(_input->info()->data_type()) && !is_arg_op)
+            if(is_serial_op)
+            {
+                // Get first input and output slices
+                Window window_in{ window };
+                window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+
+                Window in_slice  = window.first_slice_window_1D();
+                Window out_slice = window.first_slice_window_1D();
+
+                do
+                {
+                    unsigned int idx = 0;
+                    add_1D_tensor_argument(idx, _input, in_slice);
+                    add_1D_tensor_argument(idx, _output, out_slice);
+                    enqueue(queue, *this, in_slice);
+                }
+                while(window_in.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
+            }
+            else
             {
                 // Set out window
                 Window out_window(window);
@@ -263,8 +285,8 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
                 in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
 
                 // Set local sums buffer
-                unsigned int local_sum_size = lws_hint()[0] * _input->info()->element_size();
-                _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr);
+                unsigned int local_res_size = lws_hint()[0] * _input->info()->element_size();
+                _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_res_size, nullptr);
 
                 do
                 {
@@ -275,24 +297,6 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
                 }
                 while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
             }
-            else
-            {
-                // Get first input and output slices
-                Window window_in{ window };
-                window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
-
-                Window in_slice  = window.first_slice_window_1D();
-                Window out_slice = window.first_slice_window_1D();
-
-                do
-                {
-                    unsigned int idx = 0;
-                    add_1D_tensor_argument(idx, _input, in_slice);
-                    add_1D_tensor_argument(idx, _output, out_slice);
-                    enqueue(queue, *this, in_slice);
-                }
-                while(window_in.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
-            }
         }
         break;
         case 1:
--
cgit v1.2.1
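
For readers unfamiliar with the reduction semantics selected by the new -DPROD / -DOPERATION=product build options, the sketch below shows, in plain standalone C++ rather than the Arm Compute Library API, what a product reduction along the innermost (X) axis computes. The function name reduce_prod_x, the flat row-major buffer layout, and the float element type are illustrative assumptions for this sketch and are not part of the patch.

#include <cstddef>
#include <vector>

// Reference product reduction over the X axis: each row of `width`
// elements collapses to the product of its elements. This mirrors the
// semantics selected by the -DOPERATION=product / -DPROD options in the
// patched kernel; the names, layout and float type here are assumptions.
std::vector<float> reduce_prod_x(const std::vector<float> &src, std::size_t width)
{
    const std::size_t rows = src.size() / width;
    std::vector<float> dst(rows, 1.0f);
    for(std::size_t r = 0; r < rows; ++r)
    {
        for(std::size_t x = 0; x < width; ++x)
        {
            dst[r] *= src[r * width + x]; // serial accumulation along the row
        }
    }
    return dst;
}

As the diff itself shows, ARG_IDX_MAX, ARG_IDX_MIN and quantized inputs now all take the serial non_parallel_x path on axis 0, while non-quantized SUM, SUM_SQUARE, MEAN_SUM and the new PROD keep using the parallel variant with a local results buffer (renamed from local_sum_size to local_res_size).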