From b412fab0e3c8ec10e104f4d85760898a5b26179c Mon Sep 17 00:00:00 2001 From: Manuel Bottini Date: Mon, 10 Dec 2018 17:40:23 +0000 Subject: COMPMID-1724: CL Implement Prod Change-Id: I17e51f25064b53a8f7e13d6fcbecc14a192de103 Reviewed-on: https://review.mlplatform.org/387 Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins --- src/core/CL/cl_kernels/reduction_operation.cl | 68 ++++++++++---- src/core/CL/kernels/CLFillBorderKernel.cpp | 28 +++--- src/core/CL/kernels/CLReductionOperationKernel.cpp | 72 ++++++++------- src/core/NEON/kernels/NEFillBorderKernel.cpp | 2 +- src/runtime/CL/functions/CLReductionOperation.cpp | 100 ++++++++++++--------- src/runtime/NEON/functions/NEIntegralImage.cpp | 4 +- src/runtime/NEON/functions/NEScale.cpp | 4 +- 7 files changed, 166 insertions(+), 112 deletions(-) (limited to 'src') diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl index d1f47beda7..b4ede25296 100644 --- a/src/core/CL/cl_kernels/reduction_operation.cl +++ b/src/core/CL/cl_kernels/reduction_operation.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -60,12 +60,31 @@ inline DATA_TYPE sum(__global const DATA_TYPE *input) return (in.s0 + in.s1); } + +/** Calculate product of a vector + * + * @param[in] input Pointer to the first pixel. + * + * @return product of vector. + */ +inline DATA_TYPE product(__global const DATA_TYPE *input) +{ + VEC_DATA_TYPE(DATA_TYPE, 16) + in = vload16(0, input); + + in.s01234567 *= in.s89ABCDEF; + in.s0123 *= in.s4567; + in.s01 *= in.s23; + + return (in.s0 * in.s1); +} #if defined(OPERATION) /** This kernel performs parallel reduction given an operation on x-axis. * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value + * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value * * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 @@ -74,28 +93,28 @@ inline DATA_TYPE sum(__global const DATA_TYPE *input) * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] partial_sum_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt - * @param[in] partial_sum_stride_x Stride of the output tensor in X dimension (in bytes) - * @param[in] partial_sum_step_x partial_sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] partial_sum_stride_y Stride of the output tensor in Y dimension (in bytes) - * @param[in] partial_sum_step_y partial_sum_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] local_sums Local buffer for storing the partial sum + * @param[in] partial_res_ptr The local buffer to hold partial result values. 
Supported data types: same as @p src_ptr + * @param[in] partial_res_stride_x Stride of the output tensor in X dimension (in bytes) + * @param[in] partial_res_step_x partial_res_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] partial_res_stride_y Stride of the output tensor in Y dimension (in bytes) + * @param[in] partial_res_step_y partial_res_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] local_results Local buffer for storing the partial result */ __kernel void reduction_operation_x( IMAGE_DECLARATION(src), - IMAGE_DECLARATION(partial_sum), - __local DATA_TYPE *local_sums) + IMAGE_DECLARATION(partial_res), + __local DATA_TYPE *local_results) { Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image partial_sum = CONVERT_TO_IMAGE_STRUCT(partial_sum); + Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res); unsigned int lsize = get_local_size(0); unsigned int lid = get_local_id(0); for(unsigned int y = 0; y < get_local_size(1); ++y) { - local_sums[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y)); + local_results[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y)); barrier(CLK_LOCAL_MEM_FENCE); // Perform parallel reduction @@ -103,7 +122,11 @@ __kernel void reduction_operation_x( { if(lid < i) { - local_sums[lid] += local_sums[lid + i]; +#if defined(PROD) + local_results[lid] *= local_results[lid + i]; +#else //!defined(PROD) + local_results[lid] += local_results[lid + i]; +#endif //defined(PROD) } barrier(CLK_LOCAL_MEM_FENCE); } @@ -113,10 +136,10 @@ __kernel void reduction_operation_x( #if defined(MEAN) && defined(WIDTH) if(y == get_local_size(1) - 1) { - local_sums[0] /= WIDTH; + local_results[0] /= WIDTH; } #endif /* defined(MEAN) && defined(WIDTH) */ - ((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0]; + ((__global DATA_TYPE *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0]; } } } @@ -127,6 +150,7 @@ __kernel void reduction_operation_x( * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 + * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used * @note In case of ARG_MIN and ARG_MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short * * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F16/F32 and QASYMM8 for operation MEAN @@ -230,7 +254,11 @@ __kernel void reduction_operation_y( #if defined(SUM_SQUARE) in *= in; #endif // defined(SUM_SQUARE) +#if defined(PROD) + res *= in; +#else //!defined(PROD) res += in; +#endif //defined(PROD) #endif // defined(ARG_MAX) || defined(ARG_MIN) } @@ -304,7 +332,11 @@ __kernel void reduction_operation_z( #if defined(SUM_SQUARE) in *= in; #endif // defined(SUM_SQUARE) +#if defined(PROD) + res *= in; +#else //!defined(PROD) res += in; +#endif //defined(PROD) #endif // defined(ARG_MAX) || defined(ARG_MIN) } @@ -383,7 +415,11 @@ __kernel void reduction_operation_w( #if defined(SUM_SQUARE) in *= in; #endif // defined(SUM_SQUARE) +#if defined(PROD) + res *= in; +#else //!defined(PROD) res += in; +#endif //defined(PROD) #endif // defined(ARG_MAX) || defined(ARG_MIN) } @@ -397,4 +433,4 @@ __kernel void reduction_operation_w( vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr); #endif // defined(ARG_MAX) || defined(ARG_MIN) } -#endif /* defined(BATCH) && defined(DEPTH) */ \ No newline at end of file +#endif /* defined(BATCH) && defined(DEPTH) */ diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp index 69206678d0..5fdb826f8b 100644 --- a/src/core/CL/kernels/CLFillBorderKernel.cpp +++ b/src/core/CL/kernels/CLFillBorderKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -75,25 +75,18 @@ void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, Bo // Select appropriate kernel std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode)); - // Define select type required by replicate border > 1 - const DataType dt = tensor->info()->data_type(); - std::string select_type = get_underlying_cl_type_from_data_type(dt); - if(is_data_type_float(dt)) - { - select_type = (DataType::F32 == dt) ? 
"int" : "short"; - } + const DataType dt = tensor->info()->data_type(); // Define build options - std::set build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt))); - build_opts.emplace(("-DSELECT_TYPE=" + select_type)); - build_opts.emplace(("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top))); - build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom))); - build_opts.emplace(("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left))); - build_opts.emplace(("-DBORDER_SIZE_RIGHT=" + support::cpp11::to_string(border_size.right))); + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt)); + build_opts.add_option("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top)); + build_opts.add_option("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom)); + build_opts.add_option("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left)); + build_opts.add_option("-DBORDER_SIZE_RIGHT=" + support::cpp11::to_string(border_size.right)); // Create kernel - _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); + _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); _tensor = tensor; // Create static kernel arguments @@ -141,8 +134,9 @@ void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, Bo set_constant_border(idx, constant_border_value); break; case DataType::F16: + static_assert(sizeof(cl_half) == sizeof(half), "Half must be same size as cl_half"); static_assert(sizeof(cl_half) == 2, "Half must be 16 bit"); - set_constant_border(idx, constant_border_value); + set_constant_border(idx, constant_border_value); break; default: ARM_COMPUTE_ERROR("Not handled"); diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp index 959209edc0..45aa810517 100644 --- a/src/core/CL/kernels/CLReductionOperationKernel.cpp +++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -80,13 +80,13 @@ std::tuple validate_and_configure_window(ITensorInfo *input, ITe const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 
1 : 16; Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); bool window_changed = false; - const bool is_arg_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN); + const bool is_serial_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(input->data_type())); switch(axis) { case 0: { - if(is_data_type_quantized(input->data_type()) || is_arg_op) + if(is_serial_op) { AccessWindowHorizontal input_access(input, 0, input->dimension(0)); AccessWindowHorizontal output_access(output, 0, 1); @@ -153,10 +153,11 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou } build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted); - build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE="); + build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE"); build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN"); build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX"); build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MIN, "-DARG_MIN"); + build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD"); switch(op) { @@ -170,6 +171,9 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou case ReductionOperation::ARG_IDX_MAX: case ReductionOperation::ARG_IDX_MIN: break; + case ReductionOperation::PROD: + build_opts.add_option(("-DOPERATION=product")); + break; default: ARM_COMPUTE_ERROR("Unsupported reduction operation"); } @@ -177,12 +181,18 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou // Create kernel cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange(); std::string kernel_axis_name; - const bool is_arg_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN); + const bool is_serial_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(input->info()->data_type())); switch(axis) { case 0: { - if(!is_data_type_quantized(input->info()->data_type()) && !is_arg_op) + if(is_serial_op) + { + build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0))); + build_opts.add_option_if_else(_input->info()->data_type() == DataType::F32, "-DCOND_DATA_TYPE=int", "-DCOND_DATA_TYPE=short"); + kernel_axis_name = "non_parallel_x"; + } + else { build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width)); const unsigned int width_leftover = input->info()->dimension(0) % border_val; @@ -195,12 +205,6 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou lws_hint = cl::NDRange(std::min(8U, num_of_threads)); _border_size = BorderSize(0, border_width, 0, 0); } - else - { - build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0))); - build_opts.add_option_if_else(_input->info()->data_type() == DataType::F32, "-DCOND_DATA_TYPE=int", "-DCOND_DATA_TYPE=short"); - kernel_axis_name = "non_parallel_x"; - } } break; case 1: @@ -242,13 +246,31 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - const bool is_arg_op = (_op == ReductionOperation::ARG_IDX_MAX 
|| _op == ReductionOperation::ARG_IDX_MIN); + const bool is_serial_op = (_op == ReductionOperation::ARG_IDX_MAX || _op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(_input->info()->data_type())); switch(_reduction_axis) { case 0: { // We use parallel reduction only in non quantized types - if(!is_data_type_quantized(_input->info()->data_type()) && !is_arg_op) + if(is_serial_op) + { + // Get first input and output slices + Window window_in{ window }; + window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); + + Window in_slice = window.first_slice_window_1D(); + Window out_slice = window.first_slice_window_1D(); + + do + { + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, in_slice); + add_1D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice); + } + while(window_in.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice)); + } + else { // Set out window Window out_window(window); @@ -263,8 +285,8 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step())); // Set local sums buffer - unsigned int local_sum_size = lws_hint()[0] * _input->info()->element_size(); - _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr); + unsigned int local_res_size = lws_hint()[0] * _input->info()->element_size(); + _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_res_size, nullptr); do { @@ -275,24 +297,6 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que } while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); } - else - { - // Get first input and output slices - Window window_in{ window }; - window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); - - Window in_slice = window.first_slice_window_1D(); - Window out_slice = window.first_slice_window_1D(); - - do - { - unsigned int idx = 0; - add_1D_tensor_argument(idx, _input, in_slice); - add_1D_tensor_argument(idx, _output, out_slice); - enqueue(queue, *this, in_slice); - } - while(window_in.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice)); - } } break; case 1: diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp index 39bcd996f9..f4046e0851 100644 --- a/src/core/NEON/kernels/NEFillBorderKernel.cpp +++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2019 ARM Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index c5447ffd6b..e2dec6b375 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -56,15 +56,19 @@ unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int a } // namespace CLReductionOperation::CLReductionOperation(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_quantized() + : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial() { } Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) { const unsigned int num_of_stages = calculate_number_of_stages(input, axis); - - if(axis == 0 && !is_data_type_quantized(input->data_type())) + bool is_serial = is_data_type_quantized(input->data_type()) || axis != 0; + if(is_serial) + { + ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op)); + } + else { // Create temporary tensor infos auto sums_vector = arm_compute::support::cpp14::make_unique(num_of_stages - 1); @@ -81,17 +85,25 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf } ReductionOperation first_kernel_op; + ReductionOperation intermediate_kernel_op; ReductionOperation last_kernel_op; switch(op) { case ReductionOperation::SUM: case ReductionOperation::MEAN_SUM: - first_kernel_op = ReductionOperation::SUM; - last_kernel_op = op; + first_kernel_op = ReductionOperation::SUM; + intermediate_kernel_op = ReductionOperation::SUM; + last_kernel_op = op; break; case ReductionOperation::SUM_SQUARE: - first_kernel_op = ReductionOperation::SUM_SQUARE; - last_kernel_op = ReductionOperation::SUM; + first_kernel_op = ReductionOperation::SUM_SQUARE; + intermediate_kernel_op = ReductionOperation::SUM; + last_kernel_op = ReductionOperation::SUM; + break; + case ReductionOperation::PROD: + first_kernel_op = ReductionOperation::PROD; + intermediate_kernel_op = ReductionOperation::PROD; + last_kernel_op = ReductionOperation::PROD; break; default: ARM_COMPUTE_ERROR("Not supported"); @@ -103,17 +115,13 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf // Validate ReductionOperation on intermediate stages for(unsigned int i = 1; i < num_of_stages - 1; ++i) { - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, ReductionOperation::SUM)); + ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, intermediate_kernel_op)); } // Validate ReductionOperation on the last stage const unsigned int last_stage = num_of_stages - 1; ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0))); } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op)); - } return Status{}; } @@ -122,65 +130,77 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign { _num_of_stages = calculate_number_of_stages(input->info(), axis); _reduction_axis = axis; - _is_quantized = is_data_type_quantized(input->info()->data_type()); + _is_serial = is_data_type_quantized(input->info()->data_type()) || axis != 0; // Configure reduction operation kernels _reduction_kernels_vector = arm_compute::support::cpp14::make_unique(_num_of_stages); // Create 
temporary tensors - if(axis == 0 && !_is_quantized) + if(_is_serial) + { + _reduction_kernels_vector[0].configure(input, output, axis, op, 0); + } + else { _border_handlers_vector = arm_compute::support::cpp14::make_unique(_num_of_stages); - _sums_vector = arm_compute::support::cpp14::make_unique(_num_of_stages - 1); + _results_vector = arm_compute::support::cpp14::make_unique(_num_of_stages - 1); TensorShape shape{ input->info()->tensor_shape() }; for(unsigned int i = 0; i < _num_of_stages - 1; i++) { shape.set(0, ceil(shape.x() / 128.f)); - _sums_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape)); + _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape)); } // Apply ReductionOperation only on first kernel - _memory_group.manage(_sums_vector.get()); + _memory_group.manage(_results_vector.get()); ReductionOperation first_kernel_op; + ReductionOperation intermediate_kernel_op; ReductionOperation last_kernel_op; + PixelValue pixelValue; switch(op) { case ReductionOperation::SUM: case ReductionOperation::MEAN_SUM: - first_kernel_op = ReductionOperation::SUM; - last_kernel_op = op; + first_kernel_op = ReductionOperation::SUM; + intermediate_kernel_op = ReductionOperation::SUM; + last_kernel_op = op; + pixelValue = PixelValue(0); break; case ReductionOperation::SUM_SQUARE: - first_kernel_op = ReductionOperation::SUM_SQUARE; - last_kernel_op = ReductionOperation::SUM; + first_kernel_op = ReductionOperation::SUM_SQUARE; + intermediate_kernel_op = ReductionOperation::SUM; + last_kernel_op = ReductionOperation::SUM; + pixelValue = PixelValue(0); + break; + case ReductionOperation::PROD: + first_kernel_op = ReductionOperation::PROD; + intermediate_kernel_op = ReductionOperation::PROD; + last_kernel_op = ReductionOperation::PROD; + pixelValue = PixelValue(1, input->info()->data_type()); break; default: ARM_COMPUTE_ERROR("Not supported"); } - _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, first_kernel_op); - _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0)); + _reduction_kernels_vector[0].configure(input, _results_vector.get(), axis, first_kernel_op); + _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue); // Apply ReductionOperation on intermediate stages for(unsigned int i = 1; i < _num_of_stages - 1; ++i) { - _memory_group.manage(_sums_vector.get() + i); - _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM); - _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0)); - _sums_vector[i - 1].allocator()->allocate(); + _memory_group.manage(_results_vector.get() + i); + _reduction_kernels_vector[i].configure(_results_vector.get() + i - 1, _results_vector.get() + i, axis, intermediate_kernel_op); + _border_handlers_vector[i].configure(_results_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue); + _results_vector[i - 1].allocator()->allocate(); } // Apply ReductionOperation on the last stage const unsigned int last_stage = _num_of_stages - 1; const unsigned int input_width = input->info()->dimension(0); - _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width); - 
_border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0)); - _sums_vector[last_stage - 1].allocator()->allocate(); - } - else - { - _reduction_kernels_vector[0].configure(input, output, axis, op, 0); + _reduction_kernels_vector[last_stage].configure(_results_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width); + _border_handlers_vector[last_stage].configure(_results_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue); + _results_vector[last_stage - 1].allocator()->allocate(); } } @@ -188,7 +208,11 @@ void CLReductionOperation::run() { _memory_group.acquire(); - if(_reduction_axis == 0 && !_is_quantized) + if(_is_serial) + { + CLScheduler::get().enqueue(_reduction_kernels_vector[0], false); + } + else { for(unsigned int i = 0; i < _num_of_stages; ++i) { @@ -196,10 +220,6 @@ void CLReductionOperation::run() CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); } } - else - { - CLScheduler::get().enqueue(_reduction_kernels_vector[0], false); - } _memory_group.release(); } diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp index fa8aaeb5dd..43308fa169 100644 --- a/src/runtime/NEON/functions/NEIntegralImage.cpp +++ b/src/runtime/NEON/functions/NEIntegralImage.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -36,5 +36,5 @@ void NEIntegralImage::configure(const ITensor *input, ITensor *output) auto k = arm_compute::support::cpp14::make_unique(); k->configure(input, output); _kernel = std::move(k); - _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, static_cast(0.f)); + _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0)); } diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp index a9c85bd726..169b9bbf6a 100644 --- a/src/runtime/NEON/functions/NEScale.cpp +++ b/src/runtime/NEON/functions/NEScale.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -167,7 +167,7 @@ void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy pol ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } - _border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value); } Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, -- cgit v1.2.1
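
For reference, the change above combines two ideas: the existing pairwise tree reduction is reused with '*' as the combine step whenever -DPROD is defined, and the constant border value fed to the fill-border kernel switches from PixelValue(0) to PixelValue(1, data_type) because 1 is the identity element of multiplication (padding with 0 would zero out the whole product). The standalone C++ sketch below only illustrates that reasoning on the host side; it is not ARM Compute Library code, and the names (tree_reduce, Op) are invented for the example.

// Illustrative sketch only: a pairwise tree reduction whose combine step is
// either '+' or '*', padded with the operation's identity element (0 for SUM,
// 1 for PROD) so the padding never changes the result.
#include <cstddef>
#include <iostream>
#include <vector>

enum class Op { Sum, Prod };

float tree_reduce(std::vector<float> data, Op op)
{
    // Identity element of the operation: safe to pad with.
    const float identity = (op == Op::Prod) ? 1.0f : 0.0f;

    // Pad up to a power of two, playing the role of the fill-border step that
    // extends the row before the parallel reduction runs.
    std::size_t n = 1;
    while(n < data.size())
    {
        n *= 2;
    }
    data.resize(n, identity);

    // Halve the active range each step, combining element i with i + stride.
    // This mirrors the local_results[lid] *= / += local_results[lid + i] loop
    // executed by the work-group in local memory in reduction_operation_x.
    for(std::size_t stride = n / 2; stride > 0; stride /= 2)
    {
        for(std::size_t i = 0; i < stride; ++i)
        {
            data[i] = (op == Op::Prod) ? data[i] * data[i + stride]
                                       : data[i] + data[i + stride];
        }
    }
    return data[0];
}

int main()
{
    const std::vector<float> v{ 1.f, 2.f, 3.f, 4.f, 5.f };
    std::cout << "sum  = " << tree_reduce(v, Op::Sum) << "\n";  // expect 15
    std::cout << "prod = " << tree_reduce(v, Op::Prod) << "\n"; // expect 120
    return 0;
}

The same identity-element argument explains why the multi-stage path in CLReductionOperation.cpp uses PROD for the first, intermediate and last kernels alike: unlike SUM_SQUARE (square only once, then sum), the product of partial products is already the product of the whole row, so no operation change is needed between stages.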