about | summary | refs | log | tree | commit | diff
path: root/src/core/CL/kernels/CLPermuteKernel.cpp
diff options
context:
space:
mode:
author: Diego Lopez Recas <Diego.LopezRecas@arm.com> 2017-12-18 14:42:56 +0000
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:45:00 +0000
commit: 0021d750d66d199c411df00cdd8308c325f1fef3 (patch)
tree: b96e618977442a8aab335c136d369a958998d416 /src/core/CL/kernels/CLPermuteKernel.cpp
parent: 5b6904b8d9cb5e8a343cde96fd5a8701f44dff90 (diff)
download: ComputeLibrary-0021d750d66d199c411df00cdd8308c325f1fef3.tar.gz
IVGCVSW-863 Broadcast support in CL/NEON Arithmetic Add
Also, added instrumentation to support generic tensor broadcasting for NEON and CL backends.

Change-Id: I1bc5747a286e1a4b464c209067581e103d473b9a
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114201
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/kernels/CLPermuteKernel.cpp')
-rw-r--r-- src/core/CL/kernels/CLPermuteKernel.cpp | 14
1 file changed, 6 insertions, 8 deletions
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index 132de60b68..1f36445732 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -106,10 +106,10 @@ void CLPermuteKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
- Window slice_in = window.first_slice_window_4D();
- Window slice_out(slice_in);
+ Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
// Setup output slice
+ Window slice_out(slice_in);
slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
@@ -117,12 +117,10 @@ void CLPermuteKernel::run(const Window &window, cl::CommandQueue &queue)
do
{
- auto collapsed_slice_in = slice_in.collapse(ICLKernel::window(), 2);
- auto collapsed_slice_out = slice_out.collapse(ICLKernel::window(), 2);
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, collapsed_slice_in);
- add_4D_tensor_argument(idx, _output, collapsed_slice_out);
- enqueue(queue, *this, collapsed_slice_in);
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_in);
}
while(window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
}