diff options
author | Michalis Spyrou <michalis.spyrou@arm.com> | 2018-01-26 15:06:19 +0000 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:44:23 +0000 |
commit | f6402dd37092c842d1de9998b23640caf12f227b (patch) | |
tree | 217284ffe083ea6af5cb5bc2e7f348090122ff16 /src | |
parent | bd0e61238b2126e990d7811750ad4511ec2ccbd1 (diff) | |
download | ComputeLibrary-f6402dd37092c842d1de9998b23640caf12f227b.tar.gz |
COMPMID-834 Fix arm_compute_nightly_validation getting killed
Changed CLReductionOperationKernel: each kernel now computes
a 2D slice instead of a 1D one. This reduces the memory footprint
from around 1.6 GB for a 4K input image to a few MB. The large
footprint came from the __local memory and was probably the cause of this bug.
Change-Id: I71ac71ff09b041c945a134177600f0f3475e48cf
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117835
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/core/CL/cl_kernels/reduction_operation.cl | 45 | ||||
-rw-r--r-- | src/core/CL/kernels/CLReductionOperationKernel.cpp | 14 |
2 files changed, 33 insertions, 26 deletions
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl index d46a22600f..aa7403b52b 100644 --- a/src/core/CL/cl_kernels/reduction_operation.cl +++ b/src/core/CL/cl_kernels/reduction_operation.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -70,39 +70,46 @@ inline DATA_TYPE sum(__global const DATA_TYPE *input) * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor * @param[in] partial_sum_ptr The local buffer to hold sumed values. 
Supported data types: same as @p src_ptt - * @param[in] partial_sum_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] partial_sum_stride_x Stride of the output tensor in X dimension (in bytes) * @param[in] partial_sum_step_x partial_sum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] partial_sum_stride_y Stride of the output tensor in Y dimension (in bytes) + * @param[in] partial_sum_step_y partial_sum_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] local_sums Local buffer for storing the partioal sum + * @param[in] local_sums Local buffer for storing the partial sum */ __kernel void reduction_operation( - VECTOR_DECLARATION(src), - VECTOR_DECLARATION(partial_sum), + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(partial_sum), __local DATA_TYPE *local_sums) { - Vector src = CONVERT_TO_VECTOR_STRUCT(src); - Vector partial_sum = CONVERT_TO_VECTOR_STRUCT(partial_sum); + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image partial_sum = CONVERT_TO_IMAGE_STRUCT(partial_sum); unsigned int lsize = get_local_size(0); unsigned int lid = get_local_id(0); - local_sums[lid] = OPERATION((__global DATA_TYPE *)src.ptr); - barrier(CLK_LOCAL_MEM_FENCE); - - // Perform parallel reduction - for(unsigned int i = lsize >> 1; i > 0; i >>= 1) + for(unsigned int y = 0; y < get_local_size(1); ++y) { - if(lid < i) + local_sums[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y)); + barrier(CLK_LOCAL_MEM_FENCE); + + // Perform parallel reduction + for(unsigned int i = lsize >> 1; i > 0; i >>= 1) { - local_sums[lid] += local_sums[lid + i]; + if(lid < i) + { + local_sums[lid] += local_sums[lid + i]; + } + barrier(CLK_LOCAL_MEM_FENCE); } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lid == 0) - { - ((__global DATA_TYPE *)partial_sum.ptr + get_group_id(0))[0] = local_sums[0]; + if(lid == 0) + 
{ + ((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0]; + } } }
\ No newline at end of file diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp index 18a8e353d7..1dd5eb97ec 100644 --- a/src/core/CL/kernels/CLReductionOperationKernel.cpp +++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -118,8 +118,8 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); // Get first input and output slices - Window in_slice = window.first_slice_window_1D(); - Window out_slice = out_window.first_slice_window_1D(); + Window in_slice = window.first_slice_window_2D(); + Window out_slice = out_window.first_slice_window_2D(); // Reshape window const unsigned int border_width = ((in_slice.x().end() % 128) != 0) ? 128 - in_slice.x().end() % 128 : 0; @@ -127,14 +127,14 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que // Set local sums buffer unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size(); - _kernel.setArg(num_arguments_per_1D_tensor() * 2, local_sum_size, nullptr); + _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr); do { unsigned int idx = 0; - add_1D_tensor_argument(idx, _input, in_slice); - add_1D_tensor_argument(idx, _output, out_slice); + add_2D_tensor_argument(idx, _input, in_slice); + add_2D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, _lws_hint); } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice)); + while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); } |