From 93581a524a8e66ed29ace892bc5cb297287802af Mon Sep 17 00:00:00 2001
From: Pablo Marquez Tello <pablo.tello@arm.com>
Date: Thu, 21 Jul 2022 13:55:27 +0100
Subject: [ONCPUML-970] Fast math mode for fixed format kernels

Minor tweaks and test for running fixed format kernels with BF16
operations when specified by the user.

Change-Id: Ic8167f67b86b1298da65e46cfebed9f3b86940e4
Signed-off-by: Milos Puzovic <milos.puzovic@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8000
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
---
 arm_compute/core/Types.h                           |  10 +-
 .../NEON/kernels/arm_gemm/gemm_interleaved.hpp     |   2 +-
 .../operators/internal/CpuGemmAssemblyDispatch.cpp |   9 +-
 tests/main.cpp                                     |   3 +-
 tests/validation/NEON/ConvolutionLayer.cpp         | 168 +++++++++++++++------
 .../validation/fixtures/ConvolutionLayerFixture.h  |  35 ++---
 6 files changed, 159 insertions(+), 68 deletions(-)
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 66e1c8ab1f..952c174194 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1987,16 +1987,20 @@ enum class WeightFormat
 // OHWIo<interleave_by>i<block_by>
 inline int interleave_by(const WeightFormat wf)
 {
-    return ((int)wf >> 8) & 0xFFF;
+    return (static_cast<int>(wf) >> 8) & 0xFFF;
 }
 inline int block_by(const WeightFormat wf)
 {
-    return ((int)wf >> 20) & 0xF;
+    return (static_cast<int>(wf) >> 20) & 0xF;
 }
-inline bool is_fixed_format(const WeightFormat wf)
+inline bool is_fixed_format(const WeightFormat &wf)
 {
     return wf != WeightFormat::UNSPECIFIED && wf != WeightFormat::ANY;
 }
+inline bool is_fixed_format_fast_math(const WeightFormat &wf)
+{
+    return (static_cast<int>(wf) >> 4) & 0x1;
+}
 
 /** Convolution Layer Weights Information class. This class stores the necessary information to compute convolution layer when the weights are already reshaped */
 class WeightsInfo
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 6e14a68438..470cee1557 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -695,7 +695,7 @@ public:
 #endif
 
         /* Make sure we've been set up correctly. */
-        assert(_B_transposed);
+        assert(FixedFormat || _B_transposed);
         assert(_working_space);
         int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
 
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index df02d649f8..77da83070b 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/core/utils/AssemblyUtils.h"
 #include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
@@ -507,14 +508,20 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
             const TensorShape tensor_shape    = tensor_info->tensor_shape();
             const int         tensor_height   = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)];
             const int         tensor_width    = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)];
-            const int         tensor_channels = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)];
+            int               tensor_channels = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)];
             const int         interleave_by   = arm_compute::interleave_by(wf);
+            const int         blocked_by      = arm_compute::block_by(wf);
             // We need to find a new stride that is distance from the data for one
             // set of output channels to the next
             if(ldb == tensor_channels && multi_stride_b == tensor_channels * tensor_width)
             {
                 // In this case dimensions that are packed are height, width and channel
                 // so we need to stride it by interleave_by
+                if(tensor_channels % blocked_by != 0)
+                {
+                    // We need to pad
+                    tensor_channels = arm_gemm::iceildiv(tensor_channels, blocked_by) * blocked_by;
+                }
                 ldb = interleave_by * tensor_height * tensor_width * tensor_channels;
             }
             else if(multi_stride_b == 0 || (ldb == tensor_width && multi_stride_b == tensor_height * tensor_width))
diff --git a/tests/main.cpp b/tests/main.cpp
index bc264de378..ba18339e50 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -240,6 +240,7 @@ int main(int argc, char **argv)
                 p->print_entry("cpu_has_fp16", support::cpp11::to_string(cpu_info.has_fp16()));
                 p->print_entry("cpu_has_bf16", support::cpp11::to_string(cpu_info.has_bf16()));
                 p->print_entry("cpu_has_dotprod", support::cpp11::to_string(cpu_info.has_dotprod()));
+                p->print_entry("cpu_has_svebf16", support::cpp11::to_string(cpu_info.has_svebf16()));
 
                 for(unsigned int j = 0; j < num_cpus; ++j)
                 {
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 0194220e1a..67f7c8896f 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -509,23 +509,45 @@ TEST_SUITE(VariableWeightUtils)
 
 // UC2_1_* tests: the user requests a specific fixed format, but there is no kernel that supports it.
 
-FIXTURE_DATA_TEST_CASE(UC2_1_CpuGemmConv2d, HasOptImplFixture<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+template <typename ConvolutionClass>
+using HasOptImplFixtureNoFastMath = HasOptImplFixture<ConvolutionClass, /*enable_fast_math*/ false>;
+
+template <typename ConvolutionClass>
+using HasOptImplFixtureFastMath = HasOptImplFixture<ConvolutionClass, /*enable_fast_math*/ true>;
+
+// UC2_1
+
+FIXTURE_DATA_TEST_CASE(UC2_1_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+FIXTURE_DATA_TEST_CASE(UC2_1_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
                        combine(framework::dataset::make("DataType", { DataType::F32 }),
                                framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
 {
     ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
 }
-FIXTURE_DATA_TEST_CASE(UC2_1_NEGEMMConvolutionLayer, HasOptImplFixture<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+
+FIXTURE_DATA_TEST_CASE(UC2_1_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
                        combine(framework::dataset::make("DataType", { DataType::F32 }),
                                framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
 {
     ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
 }
 
-// UC2_1_* tests: the user requests a specific fixed format, and a
+FIXTURE_DATA_TEST_CASE(UC2_1_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+// UC2_2_* tests: the user requests a specific fixed format, and a
 // kernel that support that fixed format is found.
 
-FIXTURE_DATA_TEST_CASE(UC2_2_CpuGemmConv2d, HasOptImplFixture<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE(UC2_2_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
                        combine(framework::dataset::make("DataType", { DataType::F32 }),
                                framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo4 })))
 {
@@ -533,7 +555,7 @@ FIXTURE_DATA_TEST_CASE(UC2_2_CpuGemmConv2d, HasOptImplFixture<cpu::CpuGemmConv2d
     ARM_COMPUTE_EXPECT(_computed_weight_format == arm_compute::WeightFormat::OHWIo4, framework::LogLevel::ERRORS);
 }
 
-FIXTURE_DATA_TEST_CASE(UC2_2_NEGEMMConvolutionLayer, HasOptImplFixture<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE(UC2_2_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
                        combine(framework::dataset::make("DataType", { DataType::F32 }),
                                framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo4 })))
 {
@@ -541,19 +563,49 @@ FIXTURE_DATA_TEST_CASE(UC2_2_NEGEMMConvolutionLayer, HasOptImplFixture<NEGEMMCon
     ARM_COMPUTE_EXPECT(_computed_weight_format == arm_compute::WeightFormat::OHWIo4, framework::LogLevel::ERRORS);
 }
 
+FIXTURE_DATA_TEST_CASE(UC2_2_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo8i4_bf16 })))
+{
+    ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT_EQUAL(_computed_weight_format, arm_compute::WeightFormat::OHWIo8i4_bf16, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC2_2_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo8i4_bf16 })))
+{
+    ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format == arm_compute::WeightFormat::OHWIo8i4_bf16, framework::LogLevel::ERRORS);
+}
+
 // UC3_1_* tests: the user queries for ANY fixed format, but there is
 // no kernel that support the use case specified by the user (for
 // example, there is no fixed format kernel for the datatype of the
 // problem).
 
-FIXTURE_DATA_TEST_CASE(UC3_1_CpuGemmConv2d, HasOptImplFixture<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE(UC3_1_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
                        combine(framework::dataset::make("DataType", { DataType::S32 }),
                                framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
 {
     ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
 }
 
-FIXTURE_DATA_TEST_CASE(UC3_1_NEGEMMConvolutionLayer, HasOptImplFixture<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE(UC3_1_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::S32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_1_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::S32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_1_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
                        combine(framework::dataset::make("DataType", { DataType::S32 }),
                                framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
 {
@@ -570,7 +622,7 @@ FIXTURE_DATA_TEST_CASE(UC3_1_NEGEMMConvolutionLayer, HasOptImplFixture<NEGEMMCon
 // format like OHWIo4 for FP32 data returned for 128-bit NEON hardware
 // is replaced by OHWIo8 when running on 256-bit SVE.
 
-FIXTURE_DATA_TEST_CASE(UC3_2_CpuGemmConv2d, HasOptImplFixture<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE(UC3_2_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
                        combine(framework::dataset::make("DataType", { DataType::F32 }),
                                framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
 {
@@ -579,7 +631,7 @@ FIXTURE_DATA_TEST_CASE(UC3_2_CpuGemmConv2d, HasOptImplFixture<cpu::CpuGemmConv2d
     ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
 }
 
-FIXTURE_DATA_TEST_CASE(UC3_2_NEGEMMConvolutionLayer, HasOptImplFixture<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE(UC3_2_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
                        combine(framework::dataset::make("DataType", { DataType::F32 }),
                                framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
 {
@@ -587,6 +639,26 @@ FIXTURE_DATA_TEST_CASE(UC3_2_NEGEMMConvolutionLayer, HasOptImplFixture<NEGEMMCon
     ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
 }
 
+FIXTURE_DATA_TEST_CASE(UC3_2_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_2_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS);
+}
+
 namespace
 {
 using TestCaseType          = std::tuple<TensorShape, TensorShape, arm_compute::WeightFormat>;
@@ -669,7 +741,7 @@ TEST_SUITE_END() // VariableWeightUtils
 TEST_SUITE(ExperimentalCpuAPIVariableWeightWithFixtures)
 
 template <typename ScalarType>
-using VarWidth = VariableWeightsFixture<cpu::CpuGemmConv2d, Tensor, Accessor, ScalarType>;
+using VarWidth = VariableWeightsFixture<cpu::CpuGemmConv2d, Tensor, Accessor, ScalarType, /*enable_fast_math*/ false>;
 
 FIXTURE_DATA_TEST_CASE(RunSmallFloat, VarWidth<float>, framework::DatasetMode::ALL,
                        combine(combine(datasets::SmallConvolutionLayerDataset(),
@@ -689,12 +761,26 @@ FIXTURE_DATA_TEST_CASE(RunSmallHalf, VarWidth<half>, framework::DatasetMode::ALL
     validate(Accessor(_target), _reference, rel_tolerance_f16, 0.f, half(abs_tolerance_f16));
 }
 
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+template <typename ScalarType>
+using VarWidthFastMath = VariableWeightsFixture<cpu::CpuGemmConv2d, Tensor, Accessor, ScalarType, /*enable_fast_math*/ true>;
+
+FIXTURE_DATA_TEST_CASE(RunSmallFloatFastMath, VarWidthFastMath<float>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F32 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+#endif // ARM_COMPUTE_ENABLE_BF16
+
 TEST_SUITE_END() // ExperimentalCpuAPIVariableWeightWithFixtures
 
 TEST_SUITE(ExperimentalNEAPIVariableWeightWithFixtures)
 
 template <typename ScalarType>
-using NEGEMMVarWidth = VariableWeightsFixtureNEInterface<NEGEMMConvolutionLayer, Tensor, Accessor, ScalarType>;
+using NEGEMMVarWidth = VariableWeightsFixtureNEInterface<NEGEMMConvolutionLayer, Tensor, Accessor, ScalarType, /*enable_fast_math*/ false>;
 
 FIXTURE_DATA_TEST_CASE(NEGEMMRunSmallFloat, NEGEMMVarWidth<float>, framework::DatasetMode::ALL,
                        combine(combine(datasets::SmallConvolutionLayerDataset(),
@@ -714,6 +800,20 @@ FIXTURE_DATA_TEST_CASE(NEGEMMRunSmallHalf, NEGEMMVarWidth<half>, framework::Data
     validate(Accessor(_target), _reference, rel_tolerance_f16, 0.f, half(abs_tolerance_f16));
 }
 
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+template <typename ScalarType>
+using NEGEMMVarWidthFastMath = VariableWeightsFixtureNEInterface<NEGEMMConvolutionLayer, Tensor, Accessor, ScalarType, /*enable_fast_math*/ true>;
+
+FIXTURE_DATA_TEST_CASE(NEGEMMRunSmallFloatFastMath, NEGEMMVarWidthFastMath<float>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F32 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+#endif // ARM_COMPUTE_ENABLE_BF16
+
 TEST_SUITE_END() // ExperimentalNEAPIVariableWeightWithFixtures
 
 #endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
@@ -823,9 +923,7 @@ TEST_SUITE(Float)
 #if defined(ARM_COMPUTE_ENABLE_BF16)
 TEST_SUITE(BFLOAT16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                    framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                    framework::dataset::make("DataType", DataType::BFLOAT16)),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                    framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::BFLOAT16)), framework::dataset::make("DataLayout", { DataLayout::NHWC })),
                                                                                                             ActivationFunctionsDataset))
 {
     // Validate output
@@ -837,10 +935,7 @@ TEST_SUITE_END() // BFLOAT16
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                   framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                                                                                           ActivationFunctionsDataset))
+                                                                                                                   framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NCHW })), ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
@@ -850,9 +945,7 @@ TEST_SUITE_END() // FP16
 
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                    framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                    framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                             ActivationFunctionsDataset))
 {
     // Validate output
@@ -894,11 +987,8 @@ const auto QuantizedActivationFunctionsDataset = framework::dataset::make("Activ
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                       framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                                                                                                                       QuantizedActivationFunctionsDataset))
+                                                                                                                       framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })), QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -924,11 +1014,8 @@ TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                      framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                      framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.01f, -10) })),
-                                                                                                                      QuantizedActivationFunctionsDataset))
+                                                                                                                      framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.01f, -10) })), QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -1082,10 +1169,7 @@ TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                     framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                     framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                                                                                             ActivationFunctionsDataset))
+                                                                                                                     framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NHWC })), ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
@@ -1109,11 +1193,8 @@ const auto QuantizedActivationFunctionsDataset = framework::dataset::make("Activ
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                        framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                                                                                                                        QuantizedActivationFunctionsDataset))
+                                                                                                                        framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })), QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -1122,11 +1203,8 @@ TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                       framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.01f, -10) })),
-                                                                                                                       QuantizedActivationFunctionsDataset))
+                                                                                                                       framework::dataset::make("ReshapeWeights", { true })), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.01f, -10) })), QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index c58a0a2c91..63e6dc9377 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -122,14 +122,14 @@ protected:
         {
             case DataType::QASYMM8:
             {
-                std::pair<int, int> bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f);
+                std::pair<int, int>                     bounds = get_quantized_bounds(tensor.quantization_info(), -1.0f, 1.0f);
                 std::uniform_int_distribution<uint32_t> distribution(bounds.first, bounds.second);
                 library->fill(tensor, distribution, i);
                 break;
             }
             case DataType::QASYMM8_SIGNED:
             {
-                std::pair<int, int> bounds = get_quantized_qasymm8_signed_bounds(tensor.quantization_info(), -1.0f, 1.0f);
+                std::pair<int, int>                    bounds = get_quantized_qasymm8_signed_bounds(tensor.quantization_info(), -1.0f, 1.0f);
                 std::uniform_int_distribution<int32_t> distribution(bounds.first, bounds.second);
                 library->fill(tensor, distribution, i);
                 break;
@@ -476,7 +476,7 @@ inline void rearrange_data(const AccessorType src, AccessorType dst, const arm_c
                 }
 }
 
-template <typename ConvolutionFunction, typename TensorClass, typename AccessorType, typename ScalarType>
+template <typename ConvolutionFunction, typename TensorClass, typename AccessorType, typename ScalarType, bool enable_fast_math>
 class VariableWeightsFixtureBaseClass : public framework::Fixture
 {
 public:
@@ -581,14 +581,14 @@ protected:
     SimpleTensor<ScalarType>             _reference{};
 };
 
-template <typename ConvolutionFunction, typename TensorClass, typename AccessorType, typename ScalarType>
-class VariableWeightsFixture : public VariableWeightsFixtureBaseClass<ConvolutionFunction, TensorClass, AccessorType, ScalarType>
+template <typename ConvolutionFunction, typename TensorClass, typename AccessorType, typename ScalarType, bool enable_fast_math>
+class VariableWeightsFixture : public VariableWeightsFixtureBaseClass<ConvolutionFunction, TensorClass, AccessorType, ScalarType, enable_fast_math>
 {
     void configure_and_execute_kernel(TensorInfo src_tensor_info, TensorInfo weight_tensor_info, TensorInfo bias_tensor_info, TensorInfo dst_tensor_info, const WeightsInfo weights_info,
                                       const PadStrideInfo &conv_info,
                                       const Size2D        &dilation)
     {
-        this->conv->configure(&src_tensor_info, &weight_tensor_info, &bias_tensor_info, &dst_tensor_info, conv_info, weights_info, dilation);
+        this->conv->configure(&src_tensor_info, &weight_tensor_info, &bias_tensor_info, &dst_tensor_info, conv_info, weights_info, dilation, ActivationLayerInfo(), enable_fast_math);
 
         // Allocate input tensors
         auto             src                 = create_tensor<TensorClass>(src_tensor_info);
@@ -624,8 +624,8 @@ class VariableWeightsFixture : public VariableWeightsFixtureBaseClass<Convolutio
     }
 };
 
-template <typename ConvolutionFunction, typename TensorClass, typename AccessorType, typename ScalarType>
-class VariableWeightsFixtureNEInterface : public VariableWeightsFixtureBaseClass<ConvolutionFunction, TensorClass, AccessorType, ScalarType>
+template <typename ConvolutionFunction, typename TensorClass, typename AccessorType, typename ScalarType, bool enable_fast_math>
+class VariableWeightsFixtureNEInterface : public VariableWeightsFixtureBaseClass<ConvolutionFunction, TensorClass, AccessorType, ScalarType, enable_fast_math>
 {
     void configure_and_execute_kernel(TensorInfo src_tensor_info, TensorInfo weight_tensor_info, TensorInfo bias_tensor_info, TensorInfo dst_tensor_info, const WeightsInfo weights_info,
                                       const PadStrideInfo &conv_info,
@@ -644,7 +644,7 @@ class VariableWeightsFixtureNEInterface : public VariableWeightsFixtureBaseClass
         // Allocate destination tensor
         this->_target = create_tensor<TensorClass>(dst_tensor_info);
         this->_target.allocator()->allocate();
-        this->conv->configure(&src, &weights_transformed, &bias, &(this->_target), conv_info, weights_info, dilation);
+        this->conv->configure(&src, &weights_transformed, &bias, &(this->_target), conv_info, weights_info, dilation, ActivationLayerInfo(), enable_fast_math);
         // Prepare source and biases that are left unchanged.
         this->fill(AccessorType(src), 0);
         this->fill(AccessorType(bias), 1);
@@ -664,7 +664,7 @@ class VariableWeightsFixtureNEInterface : public VariableWeightsFixtureBaseClass
     }
 };
 
-template <typename ConvolutionClass>
+template <typename ConvolutionClass, bool enable_fast_math>
 class HasOptImplFixture : public framework::Fixture
 {
 public:
@@ -672,14 +672,15 @@ public:
     void setup(DataType data_type, arm_compute::WeightFormat query_weight_format)
     {
         auto              conv        = std::make_unique<ConvolutionClass>();
-        const auto        src_info    = TensorInfo(TensorShape(1U, 5U, 2U), 1, data_type, DataLayout::NHWC);
-        const auto        weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, data_type, DataLayout::NHWC);
-        const auto        bias_info   = TensorInfo(TensorShape(3U), 1, data_type, DataLayout::NHWC);
-        auto              dst_info    = TensorInfo(TensorShape(1U, 7U, 3U), 1, data_type, DataLayout::NHWC);
-        const auto        conv_info   = PadStrideInfo(1, 1, 0, 0, 2, 2, DimensionRoundingType::FLOOR);
-        const WeightsInfo weights_info(false, 3U, 3U, 1U, false, query_weight_format);
+        const auto        src_info    = TensorInfo(TensorShape(56U, 56U, 64U), 1, data_type, DataLayout::NHWC);
+        const auto        weight_info = TensorInfo(TensorShape(64, 3U, 3U, 64U), 1, enable_fast_math ? DataType::BFLOAT16 : data_type, DataLayout::NHWC);
+        const auto        bias_info   = TensorInfo(TensorShape(64U), 1, data_type, DataLayout::NHWC);
+        auto              dst_info    = TensorInfo(TensorShape(56U, 56U, 64U), 1, data_type, DataLayout::NHWC);
+        const auto        conv_info   = PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR);
+        const WeightsInfo weights_info(false, 3U, 3U, 64U, false, query_weight_format);
         _kernel_found = bool(ConvolutionClass::has_opt_impl(_computed_weight_format, &src_info, &weight_info,
-                                                            &bias_info, &dst_info, conv_info, weights_info));
+                                                            &bias_info, &dst_info, conv_info, weights_info,
+                                                            /*dilation*/ Size2D(1U, 1U), /*act_info*/ ActivationLayerInfo(), enable_fast_math));
     }
 
 protected:
-- 
cgit v1.2.1