From e52a3000d2c13bc1b66ca66b3d12b6b836982394 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Wed, 11 Apr 2018 15:59:10 +0100 Subject: COMPMID-1026 - Add support for 4x4 output tile in CLWinogradConvolutionLayer The performance achieved can be found at the following confluence page: https://confluence.arm.com/display/MLENG/GEMM-based+convolution+vs+Winograd-based+convolution+on+OpenCL Change-Id: I4b690cfdd4eb4ff0cd17b14fdd49ccaa1d1dc85c Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/127729 Tested-by: Jenkins Reviewed-by: Georgios Pinitas --- src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp') diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp index 8ee1a82209..c5d2528aa2 100644 --- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp +++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp @@ -58,6 +58,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Only 3x3 and 5x5 kernels are supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(2U, 2U) && input->dimension(2) != 16, "Wrong number of batches"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(4U, 4U) && input->dimension(2) != 36, "Wrong number of batches"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size == Size2D(4U, 4U) && input->dimension(2) != 64, "Wrong number of batches"); // Compute number of elements to process in the X and Y direction @@ -67,7 +68,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con const int num_tiles_y = std::ceil(num_elements_y / static_cast<float>(output_tile_size.height)); 
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles_x * num_tiles_y))); - ARM_COMPUTE_UNUSED(output_tile_size); if(bias != nullptr) { @@ -207,4 +207,4 @@ void CLWinogradOutputTransformKernel::run(const Window &window, cl::CommandQueue enqueue(queue, *this, slice, _lws_hint); } while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); -} \ No newline at end of file +} -- cgit v1.2.1