From 7e9391bb14d219cda310bff355669b5964b1f576 Mon Sep 17 00:00:00 2001
From: Michalis Spyrou
Date: Fri, 5 Oct 2018 14:49:28 +0100
Subject: COMPMID-1574 Implement ReduceMean in OpenCL

Change-Id: Id331199f569f52a37280a9ada5bf84694580b93c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/152843
Tested-by: bsgcomp
Reviewed-by: Michele DiGiorgio
---
 src/core/CL/CLKernelLibrary.cpp                    |   6 +-
 src/core/CL/cl_kernels/reduction_operation.cl      | 196 ++++++++++++++-
 src/core/CL/kernels/CLReductionOperationKernel.cpp | 270 ++++++++++++++++-----
 3 files changed, 406 insertions(+), 66 deletions(-)

(limited to 'src/core/CL')

diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index ce4b85551d..957543c877 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -342,7 +342,11 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
     { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" },
     { "quantization_layer", "quantization_layer.cl" },
-    { "reduction_operation", "reduction_operation.cl" },
+    { "reduction_operation_x", "reduction_operation.cl" },
+    { "reduction_operation_quantized_x", "reduction_operation.cl" },
+    { "reduction_operation_y", "reduction_operation.cl" },
+    { "reduction_operation_z", "reduction_operation.cl" },
+    { "reduction_operation_w", "reduction_operation.cl" },
     { "remap_nearest_neighbour", "remap.cl" },
     { "remap_bilinear", "remap.cl" },
     { "reorg_layer_nchw", "reorg_layer.cl" },
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index aa7403b52b..c1be4472a7 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -61,13 +61,14 @@ inline DATA_TYPE sum(__global const DATA_TYPE *input)
     return (in.s0 + in.s1);
 }
 
-/** This kernel performs reduction given an operation.
+/** This kernel performs a parallel reduction on the x-axis, given an operation.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
  * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
+ * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value
  *
- * @param[in] src_ptr                                   Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr                                   Pointer to the source tensor. Supported data types: F16/F32
  * @param[in] src_stride_x                              Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                                src_stride_x * number of elements along X processed per workitem (in bytes)
  * @param[in] src_stride_y                              Stride of the source tensor in Y dimension (in bytes)
@@ -81,7 +82,7 @@ inline DATA_TYPE sum(__global const DATA_TYPE *input)
  * @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the partial sum tensor
  * @param[in] local_sums                                Local buffer for storing the partial sums
  */
-__kernel void reduction_operation(
+__kernel void reduction_operation_x(
     IMAGE_DECLARATION(src),
     IMAGE_DECLARATION(partial_sum),
     __local DATA_TYPE *local_sums)
@@ -109,7 +110,192 @@ __kernel void reduction_operation(
     if(lid == 0)
     {
+#if defined(MEAN) && defined(WIDTH)
+        if(y == get_local_size(1) - 1)
+        {
+            local_sums[0] /= WIDTH;
+        }
+#endif /* defined(MEAN) && defined(WIDTH) */
         ((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0];
     }
 }
-}
\ No newline at end of file
+}
+
+#if defined(WIDTH)
+/** This kernel performs a reduction on the x-axis (QASYMM8).
+ *
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[out] output_ptr                           The local buffer to hold summed values. Supported data types: same as @p src_ptr
+ * @param[in]  output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduction_operation_quantized_x(
+    VECTOR_DECLARATION(src),
+    VECTOR_DECLARATION(output))
+{
+    Vector src    = CONVERT_TO_VECTOR_STRUCT(src);
+    Vector output = CONVERT_TO_VECTOR_STRUCT(output);
+
+    uint res = 0;
+
+    for(unsigned int x = 0; x < WIDTH; ++x)
+    {
+        res += *((__global uchar *)vector_offset(&src, x));
+    }
+
+#if defined(MEAN)
+    res /= WIDTH;
+#endif /* defined(MEAN) */
+
+    // Store result
+    *((__global uchar *)output.ptr) = convert_uchar(res);
+}
+#endif /* defined(WIDTH) */
+
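For reference, here is a scalar C++ sketch of what reduction_operation_quantized_x computes; it is illustrative only and not part of the patch, and the function and variable names are invented. Each row of uchar values is accumulated in a 32-bit unsigned integer so the running sum cannot wrap at 255, divided by the row width when -DMEAN is defined, and narrowed back to uchar at the end, mirroring convert_uchar(res).

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical reference for the quantized x-axis reduction above.
std::vector<uint8_t> reduce_x_quantized(const std::vector<uint8_t> &src, size_t width, size_t rows, bool mean)
{
    std::vector<uint8_t> out(rows);
    for(size_t r = 0; r < rows; ++r)
    {
        uint32_t res = 0; // wide accumulator, like the kernel's uint res
        for(size_t x = 0; x < width; ++x)
        {
            res += src[r * width + x]; // same accumulation as the OpenCL loop
        }
        if(mean)
        {
            res /= width; // integer division, as in the -DMEAN path
        }
        out[r] = static_cast<uint8_t>(res); // narrowing, like convert_uchar(res)
    }
    return out;
}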
+#if defined(HEIGHT)
+/** This kernel performs a reduction on the y-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[out] output_ptr                           The local buffer to hold summed values. Supported data types: same as @p src_ptr
+ * @param[in]  output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduction_operation_y(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(output))
+{
+    Image src    = CONVERT_TO_IMAGE_STRUCT(src);
+    Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+    res = 0;
+
+    for(unsigned int y = 0; y < HEIGHT; ++y)
+    {
+        res += CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+    }
+
+#if defined(MEAN)
+    res /= HEIGHT;
+#endif /* defined(MEAN) */
+
+    // Store result
+    vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif /* defined(HEIGHT) */
+
+#if defined(DEPTH)
+/** This kernel performs a reduction on the z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           The local buffer to hold summed values. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduction_operation_z(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+    res = 0;
+
+    for(unsigned int z = 0; z < DEPTH; ++z)
+    {
+        res += CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+    }
+
+#if defined(MEAN)
+    res /= DEPTH;
+#endif /* defined(MEAN) */
+
+    // Store result
+    vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif /* defined(DEPTH) */
+
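The y-, z-, and w-axis kernels all accumulate into VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) rather than the raw DATA_TYPE; for QASYMM8, the host code later in this patch passes -DDATA_TYPE_PROMOTED=uint. A small standalone C++ check of why that widening is necessary (illustrative, not part of the patch; the 255 bound is the maximum uchar value):

#include <cstdint>
#include <limits>

// HEIGHT values of at most 255 can sum to HEIGHT * 255, which no longer fits
// in 8 bits as soon as more than one row is reduced, while a 32-bit
// accumulator is safe for any realistic axis length.
constexpr uint64_t max_sum(uint64_t height) { return height * 255u; }

static_assert(max_sum(2) > std::numeric_limits<uint8_t>::max(),
              "even two rows can overflow an 8-bit accumulator");
static_assert(max_sum(16000000) <= std::numeric_limits<uint32_t>::max(),
              "a uint accumulator covers any realistic HEIGHT/DEPTH/BATCH");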
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs a reduction on the w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           The local buffer to hold summed values. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  output_stride_w                      Stride of the output tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduction_operation_w(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+    res = 0;
+
+    for(unsigned int w = 0; w < BATCH; ++w)
+    {
+        res += CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+    }
+
+#if defined(MEAN)
+    res /= BATCH;
+#endif /* defined(MEAN) */
+
+    // Store result
+    vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif /* defined(BATCH) && defined(DEPTH) */
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index bf36ae2c0f..d4165ccd4e 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -39,24 +39,22 @@ using namespace arm_compute;
 
 namespace
 {
-// OpenCL kernel requires input width to be a power of 2.
+// The OpenCL kernel requires the input width to be a power of 2 for the x-axis.
 constexpr unsigned int border_val = 64;
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
 {
-    ARM_COMPUTE_UNUSED(op);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
-
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && axis != 0, "Not supported reduction operation for this axis");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+    ARM_COMPUTE_RETURN_ERROR_ON(op == ReductionOperation::MEAN_SUM && axis == 0 && width == 0 && input->data_type() != DataType::QASYMM8);
 
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
     }
 
     return Status{};
@@ -69,16 +67,44 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
     output_shape.set(axis, 1);
     auto_init_if_empty(*output, output_shape, 1, input->data_type());
 
-    const unsigned int num_elems_processed_per_iteration = 16;
-
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    const unsigned int border_width = ((input->dimension(0) % border_val) != 0) ? border_val - input->dimension(0) % border_val : 0;
+    const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    bool window_changed = false;
 
-    AccessWindowStatic     input_access(input, 0, 0, input->dimension(0) + border_width, 1);
-    AccessWindowHorizontal output_access(output, 0, 1);
-
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, output->valid_region());
+    switch(axis)
+    {
+        case 0:
+        {
+            if(is_data_type_quantized(input->data_type()))
+            {
+                AccessWindowHorizontal input_access(input, 0, input->dimension(0));
+                AccessWindowHorizontal output_access(output, 0, 1);
+                window_changed = update_window_and_padding(win, input_access, output_access);
+                output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+            }
+            else
+            {
+                const unsigned int     border_width = ((input->dimension(0) % border_val) != 0) ? border_val - input->dimension(0) % border_val : 0;
+                AccessWindowStatic     input_access(input, 0, 0, input->dimension(0) + border_width, 1);
+                AccessWindowHorizontal output_access(output, 0, 1);
+                window_changed = update_window_and_padding(win, input_access, output_access);
+                output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+            }
+        }
+        break;
+        case 1:
+        case 2:
+        case 3:
+        {
+            AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+            window_changed = update_window_and_padding(win, input_access, output_access);
+            output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+        }
+        break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+    }
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
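As a standalone copy of the border expression used above, with a worked example (illustrative, not part of the patch): the non-quantized x-axis path pads the input row up to the next multiple of border_val (64) so the parallel reduction always sees complete chunks. An input width of 100 needs 28 elements of padding (100 + 28 = 128), while a width of 128 needs none.

#include <cassert>

// Same ternary as in validate_and_configure_window, isolated for clarity.
unsigned int x_axis_border(unsigned int width, unsigned int border_val = 64)
{
    return ((width % border_val) != 0) ? border_val - width % border_val : 0;
}

int main()
{
    assert(x_axis_border(100) == 28);                // 100 % 64 = 36, 64 - 36 = 28
    assert(x_axis_border(128) == 0);                 // already a multiple of 64
    assert((100 + x_axis_border(100)) % 64 == 0);    // padded width is chunk-aligned
    return 0;
}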
@@ -96,46 +122,85 @@ BorderSize CLReductionOperationKernel::border_size() const
     return _border_size;
 }
 
-void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
-
-    const unsigned int num_elems_processed_per_iteration = 16;
-    const unsigned int width_leftover = input->info()->dimension(0) % border_val;
-    const unsigned int border_width   = (width_leftover != 0) ? border_val - width_leftover : 0;
-    const unsigned int num_of_threads = ((input->info()->dimension(0) + border_width) / 16);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op, width));
 
     _input          = input;
     _output         = output;
     _reduction_axis = axis;
     _op             = op;
 
-    // Set the number of WG based on the input size. If input width is < 128
-    // we can use fewer threads than 8.
-    cl::NDRange lws_hint = cl::NDRange(std::min(8U, num_of_threads));
-    _border_size = BorderSize(0, border_width, 0, 0);
-
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+    CLBuildOptions build_opts;
+    std::string data_type_promoted = get_cl_type_from_data_type(input->info()->data_type());
+    if(is_data_type_quantized(input->info()->data_type()) && axis != 0)
+    {
+        data_type_promoted = "uint";
+    }
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
+    build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
 
     switch(op)
     {
         case ReductionOperation::SUM_SQUARE:
-            build_opts.emplace(("-DOPERATION=square_sum"));
+            build_opts.add_option(("-DOPERATION=square_sum"));
             break;
         case ReductionOperation::SUM:
-            build_opts.emplace(("-DOPERATION=sum"));
+        case ReductionOperation::MEAN_SUM:
+            build_opts.add_option(("-DOPERATION=sum"));
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported reduction operation");
     }
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reduction_operation", build_opts));
+    cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
+    std::string kernel_axis_name;
+    switch(axis)
+    {
+        case 0:
+        {
+            if(!is_data_type_quantized(input->info()->data_type()))
+            {
+                build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width));
+                const unsigned int width_leftover = input->info()->dimension(0) % border_val;
+                const unsigned int border_width   = (width_leftover != 0) ? border_val - width_leftover : 0;
+                const unsigned int num_of_threads = ((input->info()->dimension(0) + border_width) / 16);
+                kernel_axis_name = "x";
+
+                // Set the number of WG based on the input size. If input width is < 128
+                // we can use fewer threads than 8.
+                lws_hint     = cl::NDRange(std::min(8U, num_of_threads));
+                _border_size = BorderSize(0, border_width, 0, 0);
+            }
+            else
+            {
+                build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+                kernel_axis_name = "quantized_x";
+            }
+        }
+        break;
+        case 1:
+            build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+            kernel_axis_name = "y";
+            break;
+        case 2:
+            build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+            kernel_axis_name = "z";
+            break;
+        case 3:
+            build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+            build_opts.add_option("-DBATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
+            kernel_axis_name = "w";
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+    }
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reduction_operation_" + kernel_axis_name, build_opts.options()));
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
@@ -145,9 +210,9 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
     ICLKernel::configure_internal(std::get<1>(win_config), lws_hint);
 }
 
-Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op, width));
     ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
 
     return Status{};
@@ -158,28 +223,113 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    // Set out window
-    Window out_window(window);
-    out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-    // Get first input and output slices
-    Window in_slice  = window.first_slice_window_2D();
-    Window out_slice = out_window.first_slice_window_2D();
-
-    // Reshape window
-    const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0;
-    in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
-
-    // Set local sums buffer
-    unsigned int local_sum_size = lws_hint()[0] * _input->info()->element_size();
-    _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr);
-
-    do
+    switch(_reduction_axis)
     {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, in_slice);
-        add_2D_tensor_argument(idx, _output, out_slice);
-        enqueue(queue, *this, in_slice, lws_hint());
+        case 0:
+        {
+            // We use parallel reduction only for non-quantized types
+            if(!is_data_type_quantized(_input->info()->data_type()))
+            {
+                // Set out window
+                Window out_window(window);
+                out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+                // Get first input and output slices
+                Window in_slice  = window.first_slice_window_2D();
+                Window out_slice = out_window.first_slice_window_2D();
+
+                // Reshape window
+                const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0;
+                in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
+
+                // Set local sums buffer
+                unsigned int local_sum_size = lws_hint()[0] * _input->info()->element_size();
+                _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr);
+
+                do
+                {
+                    unsigned int idx = 0;
+                    add_2D_tensor_argument(idx, _input, in_slice);
+                    add_2D_tensor_argument(idx, _output, out_slice);
+                    enqueue(queue, *this, in_slice, lws_hint());
+                }
+                while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+            }
+            else
+            {
+                // Get first input and output slices
+                Window window_in{ window };
+                window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+
+                Window in_slice  = window.first_slice_window_1D();
+                Window out_slice = window.first_slice_window_1D();
+
+                do
+                {
+                    unsigned int idx = 0;
+                    add_1D_tensor_argument(idx, _input, in_slice);
+                    add_1D_tensor_argument(idx, _output, out_slice);
+                    enqueue(queue, *this, in_slice);
+                }
+                while(window_in.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
+            }
+        }
+        break;
+        case 1:
+        {
+            // Get first input and output slices
+            Window window_in{ window };
+            window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+            Window in_slice  = window_in.first_slice_window_2D();
+            Window out_slice = window.first_slice_window_2D();
+
+            do
+            {
+                unsigned int idx = 0;
+                add_2D_tensor_argument(idx, _input, in_slice);
+                add_2D_tensor_argument(idx, _output, out_slice);
+                enqueue(queue, *this, in_slice);
+            }
+            while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+        }
+        break;
+        case 2:
+        {
+            // Get first input and output slices
+            Window window_in{ window };
+            window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+            Window in_slice  = window_in.first_slice_window_3D();
+            Window out_slice = window.first_slice_window_3D();
+
+            do
+            {
+                unsigned int idx = 0;
+                add_3D_tensor_argument(idx, _input, in_slice);
+                add_3D_tensor_argument(idx, _output, out_slice);
+                enqueue(queue, *this, in_slice);
+            }
+            while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+        }
+        break;
+        case 3:
+        {
+            // Get first input and output slices
+            Window window_in{ window };
+            window_in.set(3, Window::Dimension(0, 1, 1));
+            Window in_slice  = window_in.first_slice_window_4D();
+            Window out_slice = window.first_slice_window_4D();
+
+            do
+            {
+                unsigned int idx = 0;
+                add_4D_tensor_argument(idx, _input, in_slice);
+                add_4D_tensor_argument(idx, _output, out_slice);
+                enqueue(queue, *this, in_slice);
+            }
+            while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+        }
+        break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
     }
-    while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
 }
--
cgit v1.2.1
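For context, a minimal host-side sketch of driving the new kernel for a ReduceMean over the y-axis. This is not part of the patch: the include paths, tensor shape, and the explicit trailing width argument (only meaningful for non-quantized x-axis means) are assumptions based on the signatures introduced above.

#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // A 4D F32 tensor; the shape is illustrative.
    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(128U, 24U, 4U, 2U), 1, DataType::F32));

    // Reduce the y-axis (axis 1) and divide by its length: the ReduceMean case.
    // configure() auto-initializes the output info with dimension 1 set to 1.
    CLReductionOperationKernel kernel;
    kernel.configure(&input, &output, 1, ReductionOperation::MEAN_SUM, 0);

    input.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill `input` with data, then run the kernel and wait for completion.
    CLScheduler::get().enqueue(kernel);
    CLScheduler::get().sync();
    return 0;
}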