about | summary | refs | log | tree | commit | diff
path: root/src/core/CL/kernels/CLCol2ImKernel.cpp
diff options
context:
space:
mode:
author Gian Marco <gianmarco.iodice@arm.com> 2018-02-07 23:13:06 +0000
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:46:07 +0000
commit 54f18c4a7a20ff697dc1ba66a73e9d622a407d02 (patch)
tree 782aa3f5054bfbf875a99b3a6df96d5396ec2b64 /src/core/CL/kernels/CLCol2ImKernel.cpp
parent e9146ed3b4ad8501cb17dfe5953ef0259f106c2e (diff)
download ComputeLibrary-54f18c4a7a20ff697dc1ba66a73e9d622a407d02.tar.gz
COMPMID-901 - Optimizing CLCol2ImKernel
This patch makes col2im on OpenCL 2 times faster.

Change-Id: I8d90f5a72a050355ca1fd13433d8c2c26e5e33f5
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/119442
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'src/core/CL/kernels/CLCol2ImKernel.cpp')
-rw-r--r-- src/core/CL/kernels/CLCol2ImKernel.cpp 13
1 files changed, 10 insertions, 3 deletions
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index 499e1e8fe0..c8005ec0f6 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -67,6 +67,8 @@ void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::p
// Create kernel
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
+ build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.first));
build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
@@ -87,10 +89,15 @@ void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::p
}
}
+ const unsigned int num_elems_read_per_iteration = is_data_type_fixed_point(data_type) ? 1 : 8;
+
// Configure window
- Window win = calculate_max_window(*input->info(), Steps());
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_read_per_iteration));
+
+ // Update window and padding just for the input tensor as we cannot access out-of-bounds elements in the output one
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_read_per_iteration);
+ update_window_and_padding(win, input_access);
- // The CLCol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
Coordinates coord;
coord.set_num_dimensions(output->info()->num_dimensions());
output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));