aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL
diff options
context:
space:
mode:
author Georgios Pinitas <georgios.pinitas@arm.com> 2018-06-04 19:27:13 +0100
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:52:54 +0000
commit 17812ba9f7cf2c8f5121c11760ac45fbbdb7aeaf (patch)
tree 28c7bb65a8306e82de91a644fdcc1c0947c6f6d7 /src/core/CL
parent f8d8f3aff04faf731f20411ecb91027eab4365c5 (diff)
download ComputeLibrary-17812ba9f7cf2c8f5121c11760ac45fbbdb7aeaf.tar.gz
COMPMID-817: Tuner: Port kernels to new design.
Change-Id: Iaabb1153c2abe0400ec79d51a21347debe92d642
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/134062
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/CL')
-rw-r--r-- src/core/CL/kernels/CLCol2ImKernel.cpp 15
-rw-r--r-- src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp 9
-rw-r--r-- src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp 48
-rw-r--r-- src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp 8
-rw-r--r-- src/core/CL/kernels/CLIm2ColKernel.cpp 6
-rw-r--r-- src/core/CL/kernels/CLPoolingLayerKernel.cpp 12
6 files changed, 7 insertions, 91 deletions
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index e15da7258a..4e444206f1 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -110,21 +110,6 @@ void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::p
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts.options()));
- // Configure the local work size for Bifrost with a value obtained
- // via exhaustive autotuning over 30 representative tensor shapes.
- const GPUTarget gpu_target = get_target();
- if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
- {
- if((_convolved_dims.first == 7) || (_convolved_dims.first == 14))
- {
- _lws_hint = cl::NDRange(1, 7, 1);
- }
- else
- {
- _lws_hint = cl::NDRange(1, 8, 1);
- }
- }
-
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), output->info(), _convolved_dims);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
diff --git a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
index 41ff2202ca..c89b16eedc 100644
--- a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
@@ -90,15 +90,6 @@ void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *outpu
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_im2col", build_opts.options()));
- // Configure the local work size for Bifrost with a value obtained
- // via exhaustive autotuning for the MobileNets tensor shapes.
- const GPUTarget gpu_target = get_target();
-
- if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
- {
- _lws_hint = cl::NDRange(1, 2, 1);
- }
-
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
// CLDepthwiseIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 7a9760b778..fc52f4e124 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -194,51 +194,9 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
_output = output;
_slide_matrix_b = _input1->info()->num_dimensions() >= _input0->info()->num_dimensions();
- const DataType data_type = input0->info()->data_type();
- const int fp_pos = input0->info()->fixed_point_position();
-
- // Get target architecture
- GPUTarget gpu_target = get_target();
-
- // Configure LWS hint
- switch(gpu_target)
- {
- case GPUTarget::MIDGARD:
- case GPUTarget::T600:
- case GPUTarget::T700:
- case GPUTarget::T800:
- if(output->info()->dimension(1) == 196)
- {
- _lws_hint = cl::NDRange(1, 7);
- }
- else
- {
- _lws_hint = cl::NDRange(8, 8);
- }
- break;
- case GPUTarget::G71:
- case GPUTarget::G72:
- case GPUTarget::G51:
- case GPUTarget::G51BIG:
- case GPUTarget::G51LIT:
- case GPUTarget::TNOX:
- if(input1->info()->dimension(1) == 24)
- {
- // LWS optimized for the 11x11 AlexNet convolution on Bifrost.
- _lws_hint = cl::NDRange(2, 2);
- }
- else if(output->info()->dimension(1) == 196)
- {
- _lws_hint = cl::NDRange(1, 7);
- }
- else
- {
- _lws_hint = cl::NDRange(8, 8);
- }
- break;
- default:
- _lws_hint = cl::NullRange;
- }
+ const DataType data_type = input0->info()->data_type();
+ const int fp_pos = input0->info()->fixed_point_position();
+ const GPUTarget gpu_target = get_target();
ElementsProcessed num_elements_processed{};
diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
index 1d6f388def..d8ecd501b0 100644
--- a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
@@ -110,14 +110,6 @@ void CLGEMMMatrixVectorMultiplyKernel::configure(const ICLTensor *input0, const
_kernel.setArg<int>(idx++, -_input1->info()->quantization_info().offset);
}
- // Configure the local work size for Bifrost with a value obtained
- // via exhaustive autotuning for the MobileNets tensor shapes.
- const GPUTarget gpu_target = get_target();
- if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
- {
- _lws_hint = cl::NDRange(1, 1, 1);
- }
-
// Configure kernel window
const unsigned int num_elems_read_per_iteration = 4;
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 378456cde6..53a4dca9a3 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -61,7 +61,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, b
} // namespace
CLIm2ColKernel::CLIm2ColKernel()
- : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr), _kernel_dims()
+ : _input(nullptr), _output(nullptr), _conv_info(), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr), _kernel_dims()
{
}
@@ -74,6 +74,7 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
_input = input;
_output = output;
+ _conv_info = conv_info;
_kernel_dims = kernel_dims;
const DataType data_type = input->info()->data_type();
@@ -190,10 +191,9 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
{
vector_size = kernel_dims.width;
}
- // Local work size and vector size optimized for the 11x11 AlexNet convolution on Bifrost.
+ // Vector size optimized for the 11x11 AlexNet convolution on Bifrost.
if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) && kernel_dims.width == 11)
{
- _lws_hint = cl::NDRange(1, 1, 1);
vector_size = 8;
}
const size_t width_mod_vector_size = kernel_dims.width % vector_size;
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 3091df4665..b242c5550c 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -208,8 +208,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
_output = output;
_pool_info = pool_info;
- const GPUTarget gpu_target = get_target();
- const DataType data_type = input->info()->data_type();
+ const DataType data_type = input->info()->data_type();
// Set build options
CLBuildOptions build_opts;
@@ -273,20 +272,11 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
ICLKernel::configure(std::get<1>(win_config));
- // Configure the local work size (hint) from the first two dimensions of the global work size.
- // On Bifrost, this works for up to 35x35xC filters, for which the pooling_layer_3_optimized
- // kernel is launched with gws=(9, 33, C). In any case, the hint will be ignored if it is
- // invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with).
if(data_layout == DataLayout::NCHW)
{
CLPoolingConfig pooling_config = std::get<2>(win_config);
_num_elems_processed_per_iteration = pooling_config.first;
_border_size = pooling_config.second;
- if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
- {
- cl::NDRange gws = ICLKernel::gws_from_window(std::get<1>(win_config));
- _lws_hint = cl::NDRange(gws[0], gws[1], 1);
- }
}
else
{