aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/kernels
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2018-04-11 15:59:10 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:49:37 +0000
commite52a3000d2c13bc1b66ca66b3d12b6b836982394 (patch)
tree70e8ef5ba216762604f84228805aac9bd65747b6 /src/core/CL/kernels
parentdd03870b63784abe499761da2b26b209b33f2db2 (diff)
downloadComputeLibrary-e52a3000d2c13bc1b66ca66b3d12b6b836982394.tar.gz
COMPMID-1026 - Add support for 4x4 output tile in CLWinogradConvolutionLayer
The performance achieved can be found at the following confluence page: https://confluence.arm.com/display/MLENG/GEMM-based+convolution+vs+Winograd-based+convolution+on+OpenCL Change-Id: I4b690cfdd4eb4ff0cd17b14fdd49ccaa1d1dc85c Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/127729 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'src/core/CL/kernels')
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp19
-rw-r--r--src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp8
-rw-r--r--src/core/CL/kernels/CLWinogradInputTransformKernel.cpp19
-rw-r--r--src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp4
4 files changed, 26 insertions, 24 deletions
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 0a0de7ad4e..805a594af6 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -286,17 +286,23 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
else // The input tensors have not been reshaped
{
build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
// Create kernels according to the architecture, data type and input size.
if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) && is_data_type_float(data_type))
{
- kernel_name = "gemm_mm_floating_point_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
- // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
- // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
- // FC6 and FC7 of AlexNet and VGG-16).
- if(input1->info()->dimension(0) <= 1000 && input0->info()->num_dimensions() == 1 && data_type == DataType::F32)
+ kernel_name = "gemm_mm_floating_point";
+
+ if(input0->info()->num_dimensions() != 1)
+ {
+ kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
+ }
+ else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
{
- kernel_name += "_1000";
+ // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
+ // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
+ // FC6 and FC7 of AlexNet and VGG-16).
+ kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";
}
// The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
@@ -309,7 +315,6 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
}
else // (MIDGARD and F32) or (F16)
{
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
kernel_name = "gemm_mm_floating_point";
}
build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
index d3a33c01f9..41b3ac50b5 100644
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
@@ -55,9 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U));
- ARM_COMPUTE_RETURN_ERROR_ON(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U) && output_tile_size != Size2D(4U, 4U));
- ARM_COMPUTE_RETURN_ERROR_ON(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Winograd filter transform only supports 3x3 and 5x5 kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U)
+ && output_tile_size != Size2D(4U, 4U),
+ "Winograd filter transform only supports 2x2 or 4x4 output tile for 3x3 kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U), "Winograd filter transform only supports 4x4 output tile for 5x5 kernels");
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index a47590d20f..febd22b04e 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -47,7 +47,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
const Size2D kernel_size = winograd_info.kernel_size;
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Winograd input transform only supports 3x3 and 5x5 kernels");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U), "Winograd input transform only supports 2x2 output tile for 3x3 kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U)
+ && output_tile_size != Size2D(4U, 4U),
+ "Winograd input transform only supports 2x2 or 4x4 output tile for 3x3 kernels");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U), "Winograd input transform only supports 4x4 output tile for 5x5 kernels");
ARM_COMPUTE_UNUSED(conv_info);
ARM_COMPUTE_UNUSED(output_tile_size);
@@ -111,7 +113,6 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
const int num_elements_y = input->info()->dimension(1) - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
// Check if we need to extend the right or bottom border
- // FIXME: This actually is not needed. Added just for validating the result;
const unsigned int extra_border_right = ((num_elements_x % output_tile_size.width) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.width - 1);
const unsigned int extra_border_bottom = ((num_elements_y % output_tile_size.height) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.height - 1);
@@ -137,19 +138,13 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
// Check optimized kernel if output_dims == 2x2
- if(output_tile_size.width == 2 && output_tile_size.height == 2)
+ if(output_tile_size == Size2D(2U, 2U))
{
- if((_input->info()->dimension(2) % 2) != 0)
- {
- _step_z = 1;
- }
- else
- {
- _step_z = 2;
- _lws_hint = cl::NDRange(1, 1, 8);
- }
+ _step_z = (_input->info()->dimension(2) % 2) != 0 ? 1 : 2;
}
+ _lws_hint = cl::NDRange(1, 1, 8);
+
// Append stepz and data layout
kernel_name += "_stepz";
kernel_name += support::cpp11::to_string(_step_z);
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
index 8ee1a82209..c5d2528aa2 100644
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -58,6 +58,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Only 3x3 and 5x5 kernels are supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(2U, 2U) && input->dimension(2) != 16, "Wrong number of batches");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(4U, 4U) && input->dimension(2) != 36, "Wrong number of batches");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size == Size2D(4U, 4U) && input->dimension(2) != 64, "Wrong number of batches");
// Compute number of elements to process in the X and Y direction
@@ -67,7 +68,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
const int num_tiles_y = std::ceil(num_elements_y / static_cast<float>(output_tile_size.height));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles_x * num_tiles_y)));
- ARM_COMPUTE_UNUSED(output_tile_size);
if(bias != nullptr)
{
@@ -207,4 +207,4 @@ void CLWinogradOutputTransformKernel::run(const Window &window, cl::CommandQueue
enqueue(queue, *this, slice, _lws_hint);
}
while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
-} \ No newline at end of file
+}