From a25d16c86f0d870408bc8b941aa755093417b0f0 Mon Sep 17 00:00:00 2001
From: Vidhya Sudhan Loganathan
Date: Fri, 16 Nov 2018 11:33:12 +0000
Subject: COMPMID-1266 : Add support for FP16 in CLWinogradConvolutionLayer: 5x5 kernels

Introduced F32 accumulation for F16 winograd gemm and output transform
WinogradConvolution will be available for F16 only if fast math flag is enabled

Change-Id: I215593c205236a0f9669218437bb40b184ec6a4f
---
 src/runtime/CL/functions/CLGEMM.cpp                     |  5 +++--
 src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp | 13 ++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'src/runtime/CL')

diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 6adbdc0cb6..baa0cf46dc 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -155,7 +155,8 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
 
     // Configure and tune matrix multiply kernel
     _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height,
-                                                                                                        depth_output_gemm3d, reinterpret_input_as_3d));
+                                                                                                        depth_output_gemm3d, reinterpret_input_as_3d),
+                         gemm_info.fp_mixed_precision());
     CLScheduler::get().tune_kernel_static(_mm_kernel);
 
     if(_is_interleaved_transposed)
@@ -236,7 +237,7 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
     }
 
     // Validate matrix multiply
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
 
     if(beta != 0 && c != nullptr)
     {
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 70bf3ae593..1abcb67132 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -104,9 +104,9 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we
     // Check if the Winograd configuration requires fast math
     if(!enable_fast_math)
     {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
         ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
     }
-
     const WinogradInfo winograd_info = WinogradInfo(output_tile,
                                                     kernel_size,
                                                     input_dims,
@@ -129,7 +129,8 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we
     _filter_transform.configure(weights, &_input1, winograd_info);
 
     // Configure batched matrix multiply
-    _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+    _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, GEMMLowpOutputStageInfo(),
+                                                                                                 (input->info()->data_type() == DataType::F16)));
 
     // Configure output transform
     _output_transform.configure(&_batched_mm_output, biases, output, winograd_info);
@@ -158,13 +159,10 @@ Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen
     const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
     const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, input->data_layout());
 
-    //FP16 implementation of winograd is slower than direct convolution.
-    //The following check needs to be removed when fp16 winograd is faster than direct convolution (COMPMID-1266)
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-
     // Check if the Winograd configuration requires fast math
     if(!enable_fast_math)
     {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
     }
 
@@ -188,7 +186,8 @@ Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen
     TensorShape batched_mm_output_shape = input0.tensor_shape();
     batched_mm_output_shape[0]          = input1.tensor_shape()[0];
     const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
+                                                                                                                     GEMMLowpOutputStageInfo(), (input->data_type() == DataType::F16))));
 
     // Configure output transform
     ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, winograd_info));
--
cgit v1.2.1
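Not part of the patch: a minimal caller-side sketch of the behaviour this change enables. With this commit, an F16 CLWinogradConvolutionLayer is accepted only when enable_fast_math is true; the batched GEMM then accumulates in F32 through the fp_mixed_precision flag passed in GEMMInfo. The sketch assumes the 18.11-era Compute Library API; tensor shapes and variable names are hypothetical.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Hypothetical NCHW shapes: 56x56x32 input, 5x5 kernels, 64 output feature maps, F16 data.
    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(56U, 56U, 32U), 1, DataType::F16));
    weights.allocator()->init(TensorInfo(TensorShape(5U, 5U, 32U, 64U), 1, DataType::F16));
    biases.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F16));
    dst.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F16));

    const PadStrideInfo conv_info(1, 1, 2, 2); // stride 1, "same" padding for a 5x5 kernel

    // Per this patch, validate()/configure() reject F16 unless enable_fast_math is true.
    CLWinogradConvolutionLayer winograd;
    winograd.configure(&src, &weights, &biases, &dst, conv_info,
                       ActivationLayerInfo(), true /* enable_fast_math */);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    winograd.run();
    CLScheduler::get().sync();
    return 0;
}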