 docs/user_guide/release_version_and_change_log.dox |  1
 src/cpu/operators/CpuConv2d.cpp                     | 22
 tests/datasets/LargeConvolutionLayerDataset.h       | 12
 tests/validation/NEON/ConvolutionLayer.cpp          | 19
 4 files changed, 48 insertions(+), 6 deletions(-)
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index b788957dda..21a5a368ad 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -43,6 +43,7 @@ If there is more than one release in a month then an extra sequential number is
 v24.04 Public major release
  - Optimize start-up time of @ref NEConvolutionLayer for some input configurations where GeMM is selected as the convolution algorithm
+ - Optimize @ref NEConvolutionLayer for input tensor size > 1e7 bytes and weight tensor height > 7
 v24.02 Public major release
  - Replace template writer with compute kernel writer in dynamic fusion.
diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp
index 19311733db..26ca2ee783 100644
--- a/src/cpu/operators/CpuConv2d.cpp
+++ b/src/cpu/operators/CpuConv2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021, 2023 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -209,12 +209,24 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *i
     }
     else
     {
+        const bool gemmDirectConv2d_validates =
+            bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info));
+
         // SRGAN
         // Output might not be initialized when it is an internal tensor of the layer using the convolution
-        if (input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) &&
-            (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info)))
+        if (input->total_size() > 1e7 && weights->dimension(idx_h) > 7)
         {
-            return ConvolutionMethod::DIRECT;
+            // This configuration is memory demanding for the GEMM method. GEMM_CONV2D, which uses indirect
+            // convolution kernels underneath, is the best option.
+            if (gemmDirectConv2d_validates)
+            {
+                return ConvolutionMethod::GEMM_CONV2D;
+            }
+            else if (bool(CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info)))
+            {
+                // NCHW data layout is not supported by GEMM_CONV2D
+                return ConvolutionMethod::DIRECT;
+            }
         }
         if (input->dimension(idx_c) < 16)
         {
@@ -270,7 +282,7 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *i
         {
             return ConvolutionMethod::WINOGRAD;
         }
-        if (bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)))
+        if (gemmDirectConv2d_validates)
         {
             return ConvolutionMethod::GEMM_CONV2D;
         }
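
For reference, the method chosen by the heuristic above can be queried without running the operator, via the public NEConvolutionLayer::get_convolution_method() helper (using its default weights_info, dilation and activation arguments). The sketch below is illustrative only and is not part of the patch: the shapes are the new VeryLargeConvolutionLayerDataset entry permuted into Compute Library's NHWC dimension order (C, W, H), and the method actually returned still depends on the target CPU and on which backends validate.

// Illustrative sketch (not part of the patch): query the convolution method
// selected for a large NHWC F32 workload that crosses the new thresholds.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    // NHWC shapes are expressed as (C, W, H[, N]) in Compute Library.
    TensorInfo src(TensorShape(32U, 336U, 336U), 1, DataType::F32);  // 14,450,688 bytes > 1e7
    TensorInfo wei(TensorShape(32U, 9U, 9U, 64U), 1, DataType::F32); // kernel height 9 > 7
    TensorInfo dst(TensorShape(64U, 168U, 168U), 1, DataType::F32);
    src.set_data_layout(DataLayout::NHWC);
    wei.set_data_layout(DataLayout::NHWC);
    dst.set_data_layout(DataLayout::NHWC);

    const ConvolutionMethod method =
        NEConvolutionLayer::get_convolution_method(&src, &wei, &dst, PadStrideInfo(2, 2, 4, 4));

    // With this change, GEMM_CONV2D is expected when the indirect-GEMM backend
    // validates; otherwise DIRECT remains the fallback (e.g. for NCHW).
    return method == ConvolutionMethod::GEMM_CONV2D ? 0 : 1;
}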
diff --git a/tests/datasets/LargeConvolutionLayerDataset.h b/tests/datasets/LargeConvolutionLayerDataset.h
index 72f73ba6d9..c299f2460b 100644
--- a/tests/datasets/LargeConvolutionLayerDataset.h
+++ b/tests/datasets/LargeConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020, 2023 Arm Limited.
+ * Copyright (c) 2017-2020, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -294,6 +294,16 @@ public:
     }
 };
 
+class VeryLargeConvolutionLayerDataset final : public ConvolutionLayerDataset
+{
+public:
+    VeryLargeConvolutionLayerDataset()
+    {
+        // Tensor size > 1e7 bytes && weight height > 7
+        add_config(TensorShape(336U, 336U, 32U), TensorShape(9U, 9U, 32U, 64U), TensorShape(64U), TensorShape(168U, 168U, 64U), PadStrideInfo(2, 2, 4, 4));
+    }
+};
+
 class LargeGroupedConvolutionLayerDataset final : public ConvolutionLayerDataset
 {
 public:
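
For reference (not part of the patch), the numbers behind the new entry: 336 * 336 * 32 F32 elements are 14,450,688 bytes, which clears the 1e7-byte input threshold; the 9x9 kernel clears the height > 7 condition; and the 168x168 output extent follows from floor((336 + 2*4 - 9) / 2) + 1. A minimal compile-time check of that arithmetic:

// Sketch only: sanity-check the VeryLargeConvolutionLayerDataset entry against
// the thresholds used by CpuConv2d::get_convolution_method().
#include <cstddef>

constexpr std::size_t input_bytes   = 336u * 336u * 32u * sizeof(float); // 14,450,688
constexpr unsigned    kernel_height = 9u;
constexpr unsigned    out_extent    = (336u + 2u * 4u - 9u) / 2u + 1u;   // 168

static_assert(input_bytes > 10000000u, "input tensor is larger than 1e7 bytes");
static_assert(kernel_height > 7u, "kernel height is larger than 7");
static_assert(out_extent == 168u, "matches the 168x168x64 destination shape");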
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 62690c053e..7a9230d37a 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -109,6 +109,11 @@ const auto ActivationFunctionsDataset = make("ActivationInfo",
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f)
 });
 
+const auto NoActivation = make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+});
+
 const auto ActivationFunctionsDatasetNightly = make("ActivationInfo",
 {
     ActivationLayerInfo(),
@@ -1201,6 +1206,20 @@ FIXTURE_DATA_TEST_CASE(RunPaddedWeights, NEGEMMConvolutionLayerPaddedWeightsFixt
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
 }
+
+// This very large shape test is required to exercise the heuristic path taken when the tensor size is > 1e7 bytes
+// and the weight height is larger than 7
+FIXTURE_DATA_TEST_CASE(RunVeryLarge, NEGEMMConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(datasets::VeryLargeConvolutionLayerDataset(),
+                               framework::dataset::make("ReshapeWeights", { true }),
+                               framework::dataset::make("DataType", DataType::F32),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+                               NoActivation))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
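
For context on what the RunVeryLarge fixture drives end to end, a hedged usage sketch with the same shapes follows (default NCHW layout here; the nightly test additionally covers NHWC, which is where GEMM_CONV2D can be selected). The tensor names and the omitted fill step are placeholders rather than code taken from the test fixture.

// Sketch (assumed shapes from VeryLargeConvolutionLayerDataset): configure and
// run NEConvolutionLayer on the very large configuration exercised by the test.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, wei, bia, dst;
    src.allocator()->init(TensorInfo(TensorShape(336U, 336U, 32U), 1, DataType::F32));
    wei.allocator()->init(TensorInfo(TensorShape(9U, 9U, 32U, 64U), 1, DataType::F32));
    bia.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(168U, 168U, 64U), 1, DataType::F32));

    NEConvolutionLayer conv;
    conv.configure(&src, &wei, &bia, &dst, PadStrideInfo(2, 2, 4, 4));

    src.allocator()->allocate();
    wei.allocator()->allocate();
    bia.allocator()->allocate();
    dst.allocator()->allocate();

    // Fill src/wei/bia with real data here; omitted in this sketch.
    conv.run();
    return 0;
}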