136 files changed, 6964 insertions, 5968 deletions
diff --git a/tests/validation/NEON/AbsoluteDifference.cpp b/tests/validation/NEON/AbsoluteDifference.cpp
deleted file mode 100644
index b39023eead..0000000000
--- a/tests/validation/NEON/AbsoluteDifference.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ConvertPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/AbsoluteDifferenceFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/** Input data sets **/
-const auto AbsoluteDifferenceU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType",
-                                                 DataType::U8));
-const auto AbsoluteDifferenceS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
-                                                  framework::dataset::make("DataType", DataType::S16));
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(AbsoluteDifference)
-
-template <typename T>
-using NEAbsoluteDifferenceFixture = AbsoluteDifferenceValidationFixture<Tensor, Accessor, NEAbsoluteDifference, T>;
-
-TEST_SUITE(U8)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEAbsoluteDifferenceFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), AbsoluteDifferenceU8Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEAbsoluteDifferenceFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), AbsoluteDifferenceU8Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEAbsoluteDifferenceFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), AbsoluteDifferenceS16Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEAbsoluteDifferenceFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), AbsoluteDifferenceS16Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // S16
-
-TEST_SUITE_END() // AbsoluteDifference
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Accumulate.cpp b/tests/validation/NEON/Accumulate.cpp
deleted file mode 100644
index 2b5ebbb0e7..0000000000
--- a/tests/validation/NEON/Accumulate.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEAccumulate.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ConvertPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/AccumulateFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/** Tolerance value for comparing reference's output against implementation's output for floating point data types */
-constexpr AbsoluteTolerance<float> tolerance(1.0f);
-/** Input data sets **/
-const auto AccumulateU8Dataset  = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8));
-const auto AccumulateS16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S16));
-} // namespace
-TEST_SUITE(NEON)
-TEST_SUITE(Accumulate)
-
-TEST_SUITE(U8)
-
-template <typename T1>
-using NEAccumulateFixture = AccumulateValidationFixture<Tensor, Accessor, NEAccumulate, T1, int16_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEAccumulateFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), AccumulateS16Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEAccumulateFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), AccumulateS16Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance);
-}
-
-TEST_SUITE_END() // U8
-TEST_SUITE_END() // Accumulate
-
-TEST_SUITE(AccumulateWeighted)
-
-TEST_SUITE(U8)
-
-template <typename T1>
-using NEAccumulateWeightedFixture = AccumulateWeightedValidationFixture<Tensor, Accessor, NEAccumulateWeighted, T1, uint8_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEAccumulateWeightedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), AccumulateU8Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEAccumulateWeightedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), AccumulateU8Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance);
-}
-
-TEST_SUITE_END() // U8
-TEST_SUITE_END() // AccumulateWeighted
-
-TEST_SUITE(AccumulateSquared)
-
-TEST_SUITE(U8)
-
-template <typename T1>
-using NEAccumulateSquaredFixture = AccumulateSquaredValidationFixture<Tensor, Accessor, NEAccumulateSquared, T1, int16_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEAccumulateSquaredFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), AccumulateS16Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEAccumulateSquaredFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), AccumulateS16Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance);
-}
-
-TEST_SUITE_END() // U8
-TEST_SUITE_END() // AccumulateSquared
-
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp
index 0ef4590d7e..73f5de68ac 100644
--- a/tests/validation/NEON/ActivationLayer.cpp
+++ b/tests/validation/NEON/ActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,10 +23,13 @@
  */
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/RuntimeContext.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuActivationKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ActivationFunctionsDataset.h"
@@ -37,7 +40,8 @@
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ActivationLayerFixture.h"
 
-#include "support/Requires.h"
+#include "arm_compute/Acl.hpp"
+#include "support/AclRequires.h"
 
 namespace arm_compute
 {
@@ -48,8 +52,7 @@ namespace validation
 namespace
 {
 RelativeTolerance<float> tolerance_float_sqrt(0.0001f);
-    
-    
+
 /** Define relative tolerance of the activation layer.
  *
  * @param[in] data_type  The data type used.
@@ -62,18 +65,35 @@ RelativeTolerance<float> relative_tolerance(DataType data_type, ActivationLayerI
     switch(activation)
     {
         case ActivationLayerInfo::ActivationFunction::LOGISTIC:
-        case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
         case ActivationLayerInfo::ActivationFunction::ELU:
         case ActivationLayerInfo::ActivationFunction::SQRT:
         case ActivationLayerInfo::ActivationFunction::TANH:
         case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+        case ActivationLayerInfo::ActivationFunction::SWISH:
+        case ActivationLayerInfo::ActivationFunction::GELU:
             switch(data_type)
             {
                 case DataType::F16:
+#if defined(ENABLE_SVE)
+                    return RelativeTolerance<float>(0.25f);
+#else  // !defined(ENABLE_SVE)
                     return RelativeTolerance<float>(0.1f);
+#endif // defined(ENABLE_SVE)
                 default:
                     return RelativeTolerance<float>(0.05f);
             }
+        case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+            switch(data_type)
+            {
+                case DataType::F16:
+#if defined(ENABLE_SVE)
+                    return RelativeTolerance<float>(0.9f);
+#else  // !defined(ENABLE_SVE)
+                    return RelativeTolerance<float>(0.01f);
+#endif // defined(ENABLE_SVE)
+                default:
+                    return RelativeTolerance<float>(0.00001f);
+            }
         default:
             return RelativeTolerance<float>(0.f);
     }
@@ -91,14 +111,30 @@ AbsoluteTolerance<float> absolute_tolerance(DataType data_type, ActivationLayerI
     switch(activation)
     {
         case ActivationLayerInfo::ActivationFunction::LOGISTIC:
-        case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
         case ActivationLayerInfo::ActivationFunction::SQRT:
         case ActivationLayerInfo::ActivationFunction::TANH:
+        case ActivationLayerInfo::ActivationFunction::SWISH:
         case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
             switch(data_type)
             {
                 case DataType::F16:
+#if defined(ENABLE_SVE)
+                    return AbsoluteTolerance<float>(0.25f);
+#else  // !defined(ENABLE_SVE)
                     return AbsoluteTolerance<float>(0.01f);
+#endif // defined(ENABLE_SVE)
+                default:
+                    return AbsoluteTolerance<float>(0.00001f);
+            }
+        case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+            switch(data_type)
+            {
+                case DataType::F16:
+#if defined(ENABLE_SVE)
+                    return AbsoluteTolerance<float>(0.9f);
+#else  // !defined(ENABLE_SVE)
+                    return AbsoluteTolerance<float>(0.01f);
+#endif // defined(ENABLE_SVE)
                 default:
                     return AbsoluteTolerance<float>(0.00001f);
             }
@@ -107,12 +143,27 @@ AbsoluteTolerance<float> absolute_tolerance(DataType data_type, ActivationLayerI
     }
 }
 
-/** Tolerance for quantized asymmetric operations */
-#if defined(__aarch64__)
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
-#else  // defined(__aarch64__)
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
-#endif // defined(__aarch64__)
+/** Define absolute tolerance of the activation layer for qasymm8.
+ *
+ * @param[in] activation The activation function used.
+ *
+ * @return Absolute tolerance depending on the activation function.
+ */
+AbsoluteTolerance<uint8_t> tolerance_qasymm8(ActivationLayerInfo::ActivationFunction activation)
+{
+    switch(activation)
+    {
+        case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+        case ActivationLayerInfo::ActivationFunction::SQRT:
+        case ActivationLayerInfo::ActivationFunction::TANH:
+        case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+        case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+        case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+            return AbsoluteTolerance<uint8_t>(1);
+        default:
+            return AbsoluteTolerance<uint8_t>(0);
+    }
+}
 
 constexpr AbsoluteTolerance<int16_t> tolerance_qsymm16(1);
 
@@ -125,12 +176,13 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
     DataType::F32,
 });
 
-const auto NeonActivationFunctionsDataset = concat(datasets::ActivationFunctions(), framework::dataset::make("ActivationFunction", ActivationLayerInfo::ActivationFunction::HARD_SWISH));
+const auto NeonActivationFunctionsDataset = concat(datasets::ActivationFunctions(),
+                                                   framework::dataset::make("ActivationFunction", { ActivationLayerInfo::ActivationFunction::HARD_SWISH, ActivationLayerInfo::ActivationFunction::SWISH }));
 
 /** Input data sets. */
 const auto ActivationDataset = combine(combine(framework::dataset::make("InPlace", { false, true }), NeonActivationFunctionsDataset), framework::dataset::make("AlphaBeta", { 0.5f, 1.f }));
 
-template <typename T, REQUIRES_TA(arm_compute::utils::traits::is_floating_point<T>::value)>
+template <typename T, ARM_COMPUTE_REQUIRES_TA(arm_compute::utils::traits::is_floating_point<T>::value)>
 void test_float_sqrt_boundary_value()
 {
     constexpr auto vector_size = uint32_t{ 16 };
@@ -171,6 +223,48 @@ void test_float_sqrt_boundary_value()
 TEST_SUITE(NEON)
 TEST_SUITE(ActivationLayer)
 
+/** Test case for memory injection in @ref cpu::CpuWinogradConv2d.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(ActivationAPI, framework::DatasetMode::ALL)
+{
+    acl::StatusCode err = acl::StatusCode::Success;
+
+    // Create context & Queue
+    acl::Context ctx(acl::Target::Cpu, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+    acl::Queue queue(ctx, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+    // Create activation operator
+    acl::TensorDescriptor src_info({ 2, 3 }, acl::DataType::Float32);
+    acl::TensorDescriptor dst_info({ 2, 3 }, acl::DataType::Float32);
+    acl::ActivationDesc   desc{ AclRelu, 6.f, 0.f, false };
+
+    acl::Activation act(ctx, src_info, dst_info, desc, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+    // Create tensors and feed
+    acl::Tensor src(ctx, src_info, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+    acl::Tensor dst(ctx, dst_info, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+    acl::TensorPack pack(ctx);
+    err = pack.add(src, ACL_SRC);
+    err = pack.add(dst, ACL_DST);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+    // Execute operator
+    err = act.run(queue, pack);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+}
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
@@ -192,6 +286,49 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
     bool is_valid = bool(NEActivationLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), act_info));
     ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
 }
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, concat(concat(
+               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType", { DataType::F32,
+                                                              DataType::F16,
+                                                              DataType::QASYMM8,
+                                                              DataType::QASYMM8_SIGNED,
+                                                              DataType::QSYMM16
+                                                            })),
+                combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                        framework::dataset::make("DataType", { DataType::F32,
+                                                               DataType::F16,
+                                                             }))),
+                combine(framework::dataset::make("CpuExt", std::string("SVE2")),
+                        framework::dataset::make("DataType", { DataType::QASYMM8,
+                                                               DataType::QASYMM8_SIGNED,
+                                                               DataType::QSYMM16
+                                                             }))),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.sve2 = (cpu_ext == "SVE2");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{data_type, CPUModel::GENERIC, cpu_isa,ActivationLayerInfo::ActivationFunction::BOUNDED_RELU}, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_activation";
+    if( data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)
+    {
+#ifdef __aarch64__
+        expected = "neon_q8_activation_lut";
+#else  // __aarch64__
+        expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_activation";
+#endif // __aarch64__
+    }
+    std::string actual   = selected_impl->name;
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
 // clang-format on
 // *INDENT-ON*
 
@@ -234,12 +371,15 @@ template <typename T>
 using NEActivationLayerQuantizedFixture = ActivationValidationQuantizedFixture<Tensor, Accessor, NEActivationLayer, T>;
 
 /** Input data sets. */
-const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationFunction", { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
-                                                                                                  ActivationLayerInfo::ActivationFunction::RELU,
-                                                                                                  ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-                                                                                                  ActivationLayerInfo::ActivationFunction::LOGISTIC,
-                                                                                                  ActivationLayerInfo::ActivationFunction::TANH
-                                                                                                });
+const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationFunction",
+{
+    ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+    ActivationLayerInfo::ActivationFunction::RELU,
+    ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+    ActivationLayerInfo::ActivationFunction::LOGISTIC,
+    ActivationLayerInfo::ActivationFunction::TANH,
+    ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
+});
 
 const auto QuantizedActivationDataset = combine(combine(framework::dataset::make("InPlace", { false }),
                                                         concat(QuantizedActivationFunctionsDataset, framework::dataset::make("ActivationFunction", ActivationLayerInfo::ActivationFunction::HARD_SWISH))),
@@ -253,7 +393,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerQuantizedFixture<uint8_t>, fra
                                                                                                                   framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.1f, 128.0f) })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8(_function));
 }
 TEST_SUITE_END() // QASYMM8
 
@@ -264,14 +404,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerQuantizedFixture<int8_t>, fram
                                                                                                                  framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10.0f) })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8(_function));
 }
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 /** Input data sets. */
-const auto Int16QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationFunction", { ActivationLayerInfo::ActivationFunction::LOGISTIC,
-                                                                                                       ActivationLayerInfo::ActivationFunction::TANH
-                                                                                                     });
+const auto Int16QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationFunction",
+{
+    ActivationLayerInfo::ActivationFunction::LOGISTIC,
+    ActivationLayerInfo::ActivationFunction::TANH,
+    ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+});
 const auto Int16QuantizedActivationDataset = combine(combine(framework::dataset::make("InPlace", { false }), Int16QuantizedActivationFunctionsDataset),
                                                      framework::dataset::make("AlphaBeta", { 0.5f, 1.f }));
 
@@ -288,7 +431,7 @@ TEST_SUITE_END() // QSYMM16
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // ActivationLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/AddMulAdd.cpp b/tests/validation/NEON/AddMulAdd.cpp
new file mode 100644
index 0000000000..77e3d80fe6
--- /dev/null
+++ b/tests/validation/NEON/AddMulAdd.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/AddMulAddFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr AbsoluteTolerance<float> tolerance_fp32(0.001f);     /**< Tolerance for floating point tests */
+const AbsoluteTolerance<half>      tolerance_fp16(half(0.1f)); /**< Tolerance for 16-bit floating point tests */
+constexpr AbsoluteTolerance<float> tolerance_quant(1);         /**< Tolerance for quantized tests */
+
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+
+    // Boundaries are aligned with Quantized Data ranges -- DOUBLE check before changing
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 8.f, -2.f)
+});
+
+// QASYMM8 test quantizations
+const auto qasymm8_input1_qinfo_set = framework::dataset::make("Input1QInfo", { QuantizationInfo(0.1, 10) }); // Representable Range: [-1, 24.5]
+const auto qasymm8_input2_qinfo_set = framework::dataset::make("Input2QInfo", { QuantizationInfo(0.2, 60) }); // Representable Range: [-12, 39]
+const auto qasymm8_bn_mul_qinfo_set = framework::dataset::make("BnMulInfo", { QuantizationInfo(0.001, 55) }); // Representable Range: [-0.11, 0.2]
+const auto qasymm8_bn_add_qinfo_set = framework::dataset::make("BnAddInfo", { QuantizationInfo(0.02, 20) });  // Representable Range: [-0.4, 4.7]
+
+// Representable Range: [-9.36, 51.84], Expected F32 range: [-13, 63.5], leaving some space for saturation
+const auto qasymm8_add_output_qinfo_set = framework::dataset::make("AddOutputInfo", { QuantizationInfo(0.24, 39) });
+
+// Representable Range: [-4.8, 10.5], Expected FP32 range: [-6.985, 12.7], leaving some space for saturation
+// This range also makes sense with the activation boundaries above, i.e. [-2, 8] for LU_BOUNDED_RELU and [0, 6] for BOUNDED_RELU
+const auto qasymm8_final_output_qinfo_set = framework::dataset::make("FinalOutputInfo", { QuantizationInfo(0.06, 80) });
+
+// QASYMM8_SIGNED test quantizations
+const auto qasymm8_signed_input1_qinfo_set = framework::dataset::make("Input1QInfo", { QuantizationInfo(0.1, 10) });  // Representable Range: [-13.8, 11.7]
+const auto qasymm8_signed_input2_qinfo_set = framework::dataset::make("Input2QInfo", { QuantizationInfo(0.2, -60) }); // Representable Range: [-13.6, 39.4]
+const auto qasymm8_signed_bn_mul_qinfo_set = framework::dataset::make("BnMulInfo", { QuantizationInfo(0.001, 55) });  // Representable Range: [-0.183, 0.072]
+const auto qasymm8_signed_bn_add_qinfo_set = framework::dataset::make("BnAddInfo", { QuantizationInfo(0.4, -120) });  // Representable Range: [-0.32, 9.08]
+
+// Representable Range: [-21.36, 39.84], Expected F32 range: [-27.4, 51.1], leaving some space for saturation
+const auto qasymm8_signed_add_output_qinfo_set = framework::dataset::make("AddOutputInfo", { QuantizationInfo(0.24, -39) });
+
+// Representable Range: [-4.8, 10.5], Expected FP32 range: [-9.6713, 14.0942], leaving some space for saturation
+// This range also makes sense with the activation boundaries above, i.e. [-2, 8] for LU_BOUNDED_RELU and [0, 6] for BOUNDED_RELU
+const auto qasymm8_signed_final_output_qinfo_set = framework::dataset::make("FinalOutputInfo", { QuantizationInfo(0.06, -48) });
+
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(AddMulAdd)
+
+template <typename T>
+using NEAddMulAddFloatFixture = AddMulAddFloatValidationFixture<Tensor, Accessor, NEAddMulAdd, T, true>;
+
+template <typename T>
+using NEAddMulAddFloatFixtureWoIntermOut = AddMulAddFloatValidationFixture<Tensor, Accessor, NEAddMulAdd, T, false>;
+
+TEST_SUITE(Float)
+
+TEST_SUITE(F32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAddMulAddFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(),
+                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
+                                                                                                            ActivationFunctionsDataset))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference); // Arithmetic Addition has more strict tolerance
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+// This test is to stress the case when there is no intermediate output required (i.e. nullptr)
+FIXTURE_DATA_TEST_CASE(RunSmallWithoutIntermOutput, NEAddMulAddFloatFixtureWoIntermOut<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })))
+{
+    // Validate outputs
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEAddMulAddFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(),
+                                                                                                                  framework::dataset::make("DataType", DataType::F32)),
+                                                                                                          ActivationFunctionsDataset))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference); // Arithmetic Addition has more strict tolerance
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+TEST_SUITE_END() // F32
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(F16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAddMulAddFloatFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(),
+                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
+                                                                                                           ActivationFunctionsDataset))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference); // Arithmetic Addition has more strict tolerance
+    validate(Accessor(_target), _reference, tolerance_fp16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEAddMulAddFloatFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(),
+                                                                                                                 framework::dataset::make("DataType", DataType::F16)),
+                                                                                                         ActivationFunctionsDataset))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference); // Arithmetic Addition has more strict tolerance
+    validate(Accessor(_target), _reference, tolerance_fp16);
+}
+TEST_SUITE_END() // F16
+#endif           // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+TEST_SUITE_END() // Float
+
+template <typename T>
+using NEAddMulQuantizedFixture = AddMulAddQuantizedValidationFixture<Tensor, Accessor, NEAddMulAdd, T, true>;
+
+template <typename T>
+using NEAddMulAddQuantizedFixtureWoIntermOut = AddMulAddQuantizedValidationFixture<Tensor, Accessor, NEAddMulAdd, T, false>;
+
+TEST_SUITE(Quantized)
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAddMulQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                                       ActivationFunctionsDataset),
+                                                                                                                       qasymm8_input1_qinfo_set),
+                                                                                                                       qasymm8_input2_qinfo_set),
+                                                                                                                       qasymm8_bn_mul_qinfo_set),
+                                                                                                                       qasymm8_bn_add_qinfo_set),
+                                                                                                                       qasymm8_add_output_qinfo_set),
+                                                                                                               qasymm8_final_output_qinfo_set))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference, tolerance_quant);
+    validate(Accessor(_target), _reference, tolerance_quant);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEAddMulQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
+                                                                                                                     framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                                     ActivationFunctionsDataset),
+                                                                                                                     qasymm8_input1_qinfo_set),
+                                                                                                                     qasymm8_input2_qinfo_set),
+                                                                                                                     qasymm8_bn_mul_qinfo_set),
+                                                                                                                     qasymm8_bn_add_qinfo_set),
+                                                                                                                     qasymm8_add_output_qinfo_set),
+                                                                                                             qasymm8_final_output_qinfo_set))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference, tolerance_quant);
+    validate(Accessor(_target), _reference, tolerance_quant);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAddMulQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                      framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                                                                                      ActivationFunctionsDataset),
+                                                                                                                      qasymm8_signed_input1_qinfo_set),
+                                                                                                                      qasymm8_signed_input2_qinfo_set),
+                                                                                                                      qasymm8_signed_bn_mul_qinfo_set),
+                                                                                                                      qasymm8_signed_bn_add_qinfo_set),
+                                                                                                                      qasymm8_signed_add_output_qinfo_set),
+                                                                                                              qasymm8_signed_final_output_qinfo_set))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference, tolerance_quant);
+    validate(Accessor(_target), _reference, tolerance_quant);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEAddMulQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
+                                                                                                                    framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                                                                                    ActivationFunctionsDataset),
+                                                                                                                    qasymm8_signed_input1_qinfo_set),
+                                                                                                                    qasymm8_signed_input2_qinfo_set),
+                                                                                                                    qasymm8_signed_bn_mul_qinfo_set),
+                                                                                                                    qasymm8_signed_bn_add_qinfo_set),
+                                                                                                                    qasymm8_signed_add_output_qinfo_set),
+                                                                                                            qasymm8_signed_final_output_qinfo_set))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference, tolerance_quant);
+    validate(Accessor(_target), _reference, tolerance_quant);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE_END() // Quantized
+
+TEST_SUITE_END() // AddMulAdd
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif // __aarch64__
diff --git a/tests/validation/NEON/ArgMinMax.cpp b/tests/validation/NEON/ArgMinMax.cpp
index 6aa382d9b4..91b8128dea 100644
--- a/tests/validation/NEON/ArgMinMax.cpp
+++ b/tests/validation/NEON/ArgMinMax.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,6 +43,27 @@ namespace test
 {
 namespace validation
 {
+namespace
+{
+const auto OpsDataset   = framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX });
+const auto AxisDataset  = framework::dataset::make("Axis", { 0, 1, 2, 3 });
+const auto QInfoDataset = framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) });
+
+const auto ArgMinMaxSmallDatasetAxis0 = framework::dataset::make("Shape",
+{
+    TensorShape{ 1U, 5U },
+    TensorShape{ 2U, 3U },
+    TensorShape{ 1U },
+    TensorShape{ 3U },
+    TensorShape{ 2U },
+    TensorShape{ 5U },
+    TensorShape{ 17U },
+    TensorShape{ 15U, 2U },
+});
+using ArgMinMaxSmallDataset = datasets::Small4DShapes;
+using ArgMinMaxLargeDataset = datasets::Large4DShapes;
+}
+
 TEST_SUITE(NEON)
 TEST_SUITE(ArgMinMax)
 
@@ -70,23 +91,50 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-template <typename T>
-using NEArgMinMaxValidationFixture = ArgMinMaxValidationFixture<Tensor, Accessor, NEArgMinMaxLayer, T>;
+template <typename T1, typename T2>
+using NEArgMinMaxValidationFixture = ArgMinMaxValidationFixture<Tensor, Accessor, NEArgMinMaxLayer, T1, T2>;
+
+using NEArgMinMaxValidationFixture_S32_S32 = NEArgMinMaxValidationFixture<int32_t, int32_t>;
+using NEArgMinMaxValidationFixture_F16_S32 = NEArgMinMaxValidationFixture<half, int32_t>;
+using NEArgMinMaxValidationFixture_F32_S32 = NEArgMinMaxValidationFixture<float, int32_t>;
+#ifdef __aarch64__
+using NEArgMinMaxValidationFixture_F32_S64 = NEArgMinMaxValidationFixture<float, int64_t>;
+#endif // __aarch64__
 
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall,
-                       NEArgMinMaxValidationFixture<int32_t>,
+FIXTURE_DATA_TEST_CASE(RunSmallAxis0,
+                       NEArgMinMaxValidationFixture_S32_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::S32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxSmallDatasetAxis0,
+                                                       framework::dataset::make("DataTypeIn", DataType::S32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       framework::dataset::make("Axis", { 0 })),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEArgMinMaxValidationFixture_S32_S32,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::S32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       NEArgMinMaxValidationFixture<int32_t>,
+                       NEArgMinMaxValidationFixture_S32_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::S32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxLargeDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::S32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -97,18 +145,26 @@ TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       NEArgMinMaxValidationFixture<half>,
+                       NEArgMinMaxValidationFixture_F16_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::F16)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       NEArgMinMaxValidationFixture<half>,
+                       NEArgMinMaxValidationFixture_F16_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxLargeDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::F16)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -118,18 +174,41 @@ TEST_SUITE_END() // FP16
 
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       NEArgMinMaxValidationFixture<float>,
+                       NEArgMinMaxValidationFixture_F32_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::F32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
+#ifdef __aarch64__
+FIXTURE_DATA_TEST_CASE(RunSmall_F32_S64,
+                       NEArgMinMaxValidationFixture_F32_S64,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::F32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S64)),
+                                       AxisDataset),
+                               OpsDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+#endif // __aarch64__
+
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       NEArgMinMaxValidationFixture<float>,
+                       NEArgMinMaxValidationFixture_F32_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxLargeDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::F32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -137,27 +216,35 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
-template <typename T>
-using NEArgMinMaxQuantizedValidationFixture = ArgMinMaxValidationQuantizedFixture<Tensor, Accessor, NEArgMinMaxLayer, T>;
+template <typename T1, typename T2>
+using NEArgMinMaxQuantizedValidationFixture = ArgMinMaxValidationQuantizedFixture<Tensor, Accessor, NEArgMinMaxLayer, T1, T2>;
+
+using NEArgMinMaxQuantizedValidationFixture_U8_S32 = NEArgMinMaxQuantizedValidationFixture<uint8_t, int32_t>;
+using NEArgMinMaxQuantizedValidationFixture_S8_S32 = NEArgMinMaxQuantizedValidationFixture<int8_t, int32_t>;
 
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       NEArgMinMaxQuantizedValidationFixture<uint8_t>,
+                       NEArgMinMaxQuantizedValidationFixture_U8_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       NEArgMinMaxQuantizedValidationFixture<uint8_t>,
+                       NEArgMinMaxQuantizedValidationFixture_U8_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxLargeDataset(),
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -166,29 +253,34 @@ TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       NEArgMinMaxQuantizedValidationFixture<int8_t>,
+                       NEArgMinMaxQuantizedValidationFixture_S8_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 127.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8_SIGNED)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       NEArgMinMaxQuantizedValidationFixture<int8_t>,
+                       NEArgMinMaxQuantizedValidationFixture_S8_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 127.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxLargeDataset(),
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8_SIGNED)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // ArgMinMax
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ArithmeticAddition.cpp b/tests/validation/NEON/ArithmeticAddition.cpp
index 7b3d4f9ac0..535c3e634e 100644
--- a/tests/validation/NEON/ArithmeticAddition.cpp
+++ b/tests/validation/NEON/ArithmeticAddition.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,9 +22,12 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuAddKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ConvertPolicyDataset.h"
@@ -43,29 +46,13 @@ namespace validation
 {
 namespace
 {
-#ifndef __aarch64__
+#if !defined(__aarch64__) || defined(ENABLE_SVE)
 constexpr AbsoluteTolerance<float> tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
-#endif                                                 //__aarch64__
-
-/** Input data sets **/
-const auto ArithmeticAdditionU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType",
-                                                 DataType::U8));
-const auto ArithmeticAdditionS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
-                                                  framework::dataset::make("DataType", DataType::S16));
-const auto ArithmeticAdditionS32Dataset = combine(combine(framework::dataset::make("DataType", { DataType::S32 }), framework::dataset::make("DataType", DataType::S32)),
-                                                  framework::dataset::make("DataType", DataType::S32));
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-const auto ArithmeticAdditionFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F16)),
-                                                   framework::dataset::make("DataType", DataType::F16));
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-const auto ArithmeticAdditionFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
-                                                   framework::dataset::make("DataType", DataType::F32));
-const auto ArithmeticAdditionQASYMM8Dataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                      framework::dataset::make("DataType", DataType::QASYMM8));
-const auto ArithmeticAdditionQASYMM8SIGNEDDataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                            framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
-const auto ArithmeticAdditionQSYMM16Dataset = combine(combine(framework::dataset::make("DataType", DataType::QSYMM16), framework::dataset::make("DataType", DataType::QSYMM16)),
-                                                      framework::dataset::make("DataType", DataType::QSYMM16));
+#else                                                  // !defined(__aarch64__) || defined(ENABLE_SVE)
+constexpr AbsoluteTolerance<float> tolerance_quant(0);
+#endif                                                 // !defined(__aarch64__) || defined(ENABLE_SVE)
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -77,25 +64,22 @@ using NEArithmeticAdditionFixture = ArithmeticAdditionValidationFixture<Tensor,
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-               framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), // Unsupported broadcast
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),// Mismatching shapes
                                                       }),
-               framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(1U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Expected", { true, true, false, false, false})),
+               framework::dataset::make("Expected", { true, false, false, false})),
                input1_info, input2_info, output_info, expected)
 {
     Status s = NEArithmeticAddition::validate(&input1_info.clone()->set_is_resizable(false),
@@ -104,6 +88,63 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                               ConvertPolicy::WRAP);
     ARM_COMPUTE_EXPECT(bool(s) == expected, framework::LogLevel::ERRORS);
 }
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, concat(concat(
+                combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType", { DataType::F32,
+                                                              DataType::F16,
+                                                              DataType::U8,
+                                                              DataType::S16,
+                                                              DataType::S32,
+                                                              DataType::QASYMM8,
+                                                              DataType::QASYMM8_SIGNED,
+                                                              DataType::QSYMM16
+                                                            })),
+                        framework::dataset::make("CanUseFixedpoint", {true, false})),
+                combine(combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                        framework::dataset::make("DataType", { DataType::F32,
+                                                               DataType::F16,
+                                                               DataType::U8,
+                                                               DataType::S16,
+                                                               DataType::S32
+                                                             })),
+                        framework::dataset::make("CanUseFixedpoint", {true, false}))),
+                combine(combine(framework::dataset::make("CpuExt", std::string("SVE2")),
+                        framework::dataset::make("DataType", { DataType::QASYMM8,
+                                                               DataType::QASYMM8_SIGNED,
+                                                               DataType::QSYMM16
+                                                             })),
+                        framework::dataset::make("CanUseFixedpoint", {true, false}))),
+               cpu_ext, data_type, can_use_fixedpoint)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.sve2 = (cpu_ext == "SVE2");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{data_type, cpu_isa, can_use_fixedpoint}, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    bool qasymm8_any = (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED);
+
+    std::string expected;
+    if(qasymm8_any && can_use_fixedpoint)
+    {
+        expected = "neon_" + cpu_impl_dt(data_type) + "_add_fixedpoint";
+    }
+    else
+    {
+        expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_add";
+    }
+
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
 // clang-format on
 // *INDENT-ON*
 
@@ -125,8 +166,10 @@ TEST_CASE(NoPaddingAdded, framework::DatasetMode::PRECOMMIT)
 
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionU8Dataset),
-                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                  DataType::U8)),
+                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                  OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -134,15 +177,19 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<uint8_t>, framework
 TEST_SUITE_END() // U8
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionS16Dataset),
-                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                  DataType::S16)),
+                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                  OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ArithmeticAdditionS16Dataset),
-                                                                                                                framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                        DataType::S16)),
+                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -150,8 +197,10 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<int16_t>, framework
 TEST_SUITE_END() // S16
 
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<int32_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticAdditionS32Dataset),
-                                                                                                            framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<int32_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                    DataType::S32)),
+                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                            OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -162,8 +211,9 @@ TEST_SUITE_END() // Integer
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticAdditionFP16Dataset),
-                                                                                                         framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                 framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                         OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -172,15 +222,19 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionFP32Dataset),
-                                                                                                                framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                        DataType::F32)),
+                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ArithmeticAdditionFP32Dataset),
-                                                                                                              framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                      DataType::F32)),
+                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                              OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -189,17 +243,19 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<float>, framework::
 template <typename T>
 using NEArithmeticAdditionBroadcastFixture = ArithmeticAdditionBroadcastValidationFixture<Tensor, Accessor, NEArithmeticAddition, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticAdditionBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapesBroadcast(),
-                       ArithmeticAdditionFP32Dataset),
-                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticAdditionBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapesBroadcast(),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEArithmeticAdditionBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapesBroadcast(),
-                       ArithmeticAdditionFP32Dataset),
-                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEArithmeticAdditionBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapesBroadcast(),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -218,18 +274,15 @@ TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEArithmeticAdditionQuantizedFixture<uint8_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(datasets::SmallShapes(), ArithmeticAdditionQASYMM8Dataset),
-                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                               framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
-                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                               framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                       combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               OutOfPlaceDataSet))
 {
     // Validate output
-#ifdef __aarch64__
-    validate(Accessor(_target), _reference);
-#else  //__aarch64__
     validate(Accessor(_target), _reference, tolerance_quant);
-#endif //__aarch64__
 }
 TEST_SUITE_END() // QASYMM8
 
@@ -237,33 +290,27 @@ TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEArithmeticAdditionQuantizedFixture<int8_t>,
                        framework::DatasetMode::ALL,
-                       combine(combine(combine(combine(combine(datasets::SmallShapes(), ArithmeticAdditionQASYMM8SIGNEDDataset),
-                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                               framework::dataset::make("Src0QInfo", { QuantizationInfo(0.5f, 20) })),
-                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(0.5f, 10) })),
-                               framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })))
+                       combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(0.5f, 20) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(0.5f, 10) })),
+                                       framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })),
+                               OutOfPlaceDataSet))
 {
     // Validate output
-#ifdef __aarch64__
-    validate(Accessor(_target), _reference);
-#else  //__aarch64__
     validate(Accessor(_target), _reference, tolerance_quant);
-#endif //__aarch64__
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticAdditionQuantizedBroadcastFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(
-                           datasets::SmallShapesBroadcast(), ArithmeticAdditionQASYMM8SIGNEDDataset),
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticAdditionQuantizedBroadcastFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(
+                           datasets::SmallShapesBroadcast(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                        framework::dataset::make("Src0QInfo", { QuantizationInfo(0.5f, 20) })),
                        framework::dataset::make("Src1QInfo", { QuantizationInfo(0.5f, 10) })),
-                       framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })))
+                       framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
-#ifdef __aarch64__
-    validate(Accessor(_target), _reference);
-#else  //__aarch64__
     validate(Accessor(_target), _reference, tolerance_quant);
-#endif //__aarch64__
 }
 TEST_SUITE_END() // QASYMM8_SIGNED
 
@@ -271,24 +318,21 @@ TEST_SUITE(QSYMM16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEArithmeticAdditionQuantizedFixture<int16_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(datasets::SmallShapes(), ArithmeticAdditionQSYMM16Dataset),
-                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                               framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
-                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
-                               framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })))
+                       combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QSYMM16)),
+                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
+                                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })),
+                               OutOfPlaceDataSet))
 {
     // Validate output
-#ifdef __aarch64__
-    validate(Accessor(_target), _reference);
-#else  //__aarch64__
     validate(Accessor(_target), _reference, tolerance_quant);
-#endif //__aarch64__
 }
 TEST_SUITE_END() // QSYMM16
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // ArithmeticAddition
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ArithmeticSubtraction.cpp b/tests/validation/NEON/ArithmeticSubtraction.cpp
index 12fe64c396..8886ca2db5 100644
--- a/tests/validation/NEON/ArithmeticSubtraction.cpp
+++ b/tests/validation/NEON/ArithmeticSubtraction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,45 +50,16 @@ constexpr AbsoluteTolerance<float> tolerance_qasymm8(1); /**< Tolerance value fo
 #endif                                                     //__aarch64__
 constexpr AbsoluteTolerance<int16_t> tolerance_qsymm16(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
 
-/** Input data sets **/
-const auto ArithmeticSubtractionQASYMM8Dataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8),
-                                                                 framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                         framework::dataset::make("DataType", DataType::QASYMM8));
-
-const auto ArithmeticSubtractionQASYMM8SIGNEDDataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED),
-                                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
-
-const auto ArithmeticSubtractionQSYMM16Dataset = combine(combine(framework::dataset::make("DataType", DataType::QSYMM16),
-                                                                 framework::dataset::make("DataType", DataType::QSYMM16)),
-                                                         framework::dataset::make("DataType", DataType::QSYMM16));
-
-const auto ArithmeticSubtractionU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8),
-                                                            framework::dataset::make("DataType", DataType::U8)),
-                                                    framework::dataset::make("DataType", DataType::U8));
-
-const auto ArithmeticSubtractionS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }),
-                                                             framework::dataset::make("DataType", DataType::S16)),
-                                                     framework::dataset::make("DataType", DataType::S16));
-
-const auto ArithmeticSubtractionS32Dataset = combine(combine(framework::dataset::make("DataType", DataType::S32),
-                                                             framework::dataset::make("DataType", DataType::S32)),
-                                                     framework::dataset::make("DataType", DataType::S32));
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-const auto ArithmeticSubtractionFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16),
-                                                              framework::dataset::make("DataType", DataType::F16)),
-                                                      framework::dataset::make("DataType", DataType::F16));
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-const auto ArithmeticSubtractionFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32),
-                                                              framework::dataset::make("DataType", DataType::F32)),
-                                                      framework::dataset::make("DataType", DataType::F32));
-
+// Quantization Infomation DataSet
 const auto ArithmeticSubtractionQuantizationInfoDataset = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(10, 120) }),
                                                                           framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(20, 110) })),
                                                                   framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(15, 125) }));
 const auto ArithmeticSubtractionQuantizationInfoSignedDataset = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(0.5f, 10) }),
                                                                                 framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(0.5f, 20) })),
                                                                         framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(0.5f, 50) }));
+const auto ArithmeticSubtractionQuantizationInfoSignedInPlaceDataset = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(0.8f, 10) }),
+                                                                                       framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(0.8f, 10) })),
+                                                                               framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(0.8f, 10) }));
 const auto ArithmeticSubtractionQuantizationInfoSymmetric = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(0.3f, 0) }),
                                                                             framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(0.7f, 0) })),
                                                                     framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(0.2f, 0) }));
@@ -105,35 +76,31 @@ using NEArithmeticSubtractionFixture = ArithmeticSubtractionValidationFixture<Te
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-        framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+        framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Invalid data type combination
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),     // Mismatching shapes
                                                  TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::QASYMM8), // Mismatching types
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), // Invalid convert policy
         }),
-        framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+        framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
         })),
-        framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
-                                                TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+        framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
         })),
-        framework::dataset::make("ConvertPolicy",{ ConvertPolicy::WRAP,
-                                                ConvertPolicy::SATURATE,
-                                                ConvertPolicy::SATURATE,
-                                                ConvertPolicy::WRAP,
-                                                ConvertPolicy::WRAP,
-                                                ConvertPolicy::WRAP,
+        framework::dataset::make("ConvertPolicy",{ ConvertPolicy::SATURATE,
+                                                   ConvertPolicy::SATURATE,
+                                                   ConvertPolicy::WRAP,
+                                                   ConvertPolicy::WRAP,
+                                                   ConvertPolicy::WRAP,
         })),
-        framework::dataset::make("Expected", { true, true, false, false, false, false})),
+        framework::dataset::make("Expected", { true, false, false, false, false})),
         input1_info, input2_info, output_info, policy, expected)
 {
     ARM_COMPUTE_EXPECT(bool(NEArithmeticSubtraction::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), policy)) == expected, framework::LogLevel::ERRORS);
@@ -194,7 +161,8 @@ TEST_CASE(InvalidBroadcastBoth, framework::DatasetMode::ALL)
 TEST_SUITE_END() // InPlaceValidate
 
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionU8Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                     DataType::U8)),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                      OutOfPlaceDataSet))
 {
@@ -210,10 +178,11 @@ using NEArithmeticSubtractionQSYMM16Fixture                = ArithmeticSubtracti
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionQASYMM8Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                     DataType::QASYMM8)),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                                                                                                                      ArithmeticSubtractionQuantizationInfoDataset),
-                                                                                                             InPlaceDataSet))
+                                                                                                             OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -222,19 +191,17 @@ TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8SignedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(
-                                                                                                                       datasets::SmallShapes(),
-                                                                                                                       ArithmeticSubtractionQASYMM8SIGNEDDataset),
+                                                                                                                       datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                                                                                                                    ArithmeticSubtractionQuantizationInfoSignedDataset),
-                                                                                                                   InPlaceDataSet))
+                                                                                                                   OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-
 FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQASYMM8SignedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(
                            datasets::SmallShapesBroadcast(),
-                           ArithmeticSubtractionQASYMM8SIGNEDDataset),
+                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                        ArithmeticSubtractionQuantizationInfoSignedDataset),
                        OutOfPlaceDataSet))
@@ -242,12 +209,22 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQASYMM8SignedBr
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEArithmeticSubtractionQASYMM8SignedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(
+                           datasets::TinyShapesBroadcastInplace(),
+                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                       ArithmeticSubtractionQuantizationInfoSignedInPlaceDataset),
+                       InPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 TEST_SUITE(QSYMM16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQSYMM16Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(
         datasets::SmallShapes(),
-        ArithmeticSubtractionQSYMM16Dataset),
+        framework::dataset::make("DataType", DataType::QSYMM16)),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                                                                                                                      ArithmeticSubtractionQuantizationInfoSymmetric),
                                                                                                              OutOfPlaceDataSet))
@@ -259,7 +236,8 @@ TEST_SUITE_END() // QSYMM16
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionS16Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                     DataType::S16)),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                      OutOfPlaceDataSet))
 {
@@ -267,7 +245,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int16_t>, framew
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), ArithmeticSubtractionS16Dataset),
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::S16)),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                    OutOfPlaceDataSet))
 {
@@ -277,7 +256,8 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<int16_t>, framew
 TEST_SUITE_END() // S16
 
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionS32Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                     DataType::S32)),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                      OutOfPlaceDataSet))
 {
@@ -285,7 +265,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int32_t>, framew
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<int32_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), ArithmeticSubtractionS32Dataset),
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<int32_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::S32)),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                    OutOfPlaceDataSet))
 {
@@ -297,7 +278,8 @@ TEST_SUITE_END() // S32
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP16Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                    DataType::F16)),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                             OutOfPlaceDataSet))
 {
@@ -308,7 +290,8 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP32Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::F32)),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                    InPlaceDataSet))
 {
@@ -316,7 +299,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<float>, framewor
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), ArithmeticSubtractionFP32Dataset),
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                 DataType::F32)),
                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                  OutOfPlaceDataSet))
 {
@@ -328,7 +312,7 @@ template <typename T>
 using NEArithmeticSubtractionBroadcastFixture = ArithmeticSubtractionBroadcastValidationFixture<Tensor, Accessor, NEArithmeticSubtraction, T>;
 
 FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapesBroadcast(),
-                       ArithmeticSubtractionFP32Dataset),
+                       framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                        OutOfPlaceDataSet))
 {
@@ -337,7 +321,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionBroadcastFixtur
 }
 
 FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEArithmeticSubtractionBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapesBroadcast(),
-                       ArithmeticSubtractionFP32Dataset),
+                       framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                        OutOfPlaceDataSet))
 {
@@ -348,7 +332,7 @@ TEST_SUITE_END() // F32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // ArithmeticSubtraction
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/BatchNormalizationLayer.cpp b/tests/validation/NEON/BatchNormalizationLayer.cpp
index 067c5bb198..50eaf0c667 100644
--- a/tests/validation/NEON/BatchNormalizationLayer.cpp
+++ b/tests/validation/NEON/BatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,8 +51,9 @@ namespace
 RelativeTolerance<float>           rel_tolerance_f32(0.05f);   /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 constexpr AbsoluteTolerance<float> abs_tolerance_f32(0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-constexpr AbsoluteTolerance<float> tolerance_f16(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
-#endif                                                   // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+constexpr AbsoluteTolerance<float> abs_tolerance_f16(0.015f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+#endif                                                       // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
 const auto act_infos = framework::dataset::make("ActivationInfo",
 {
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
@@ -148,7 +149,7 @@ FIXTURE_DATA_TEST_CASE(RandomSmall, NEBatchNormalizationLayerFixture<half>, fram
                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_f16, 0);
+    validate(Accessor(_target), _reference, abs_tolerance_f16, 0);
 }
 
 FIXTURE_DATA_TEST_CASE(RandomLarge, NEBatchNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::LargeRandomBatchNormalizationLayerDataset(),
@@ -159,7 +160,7 @@ FIXTURE_DATA_TEST_CASE(RandomLarge, NEBatchNormalizationLayerFixture<half>, fram
                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_f16, 0);
+    validate(Accessor(_target), _reference, abs_tolerance_f16, 0);
 }
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -219,7 +220,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // BatchNormalizationLayerFusion
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/BatchToSpaceLayer.cpp b/tests/validation/NEON/BatchToSpaceLayer.cpp
index ff78ce8fa4..8cf11b7b95 100644
--- a/tests/validation/NEON/BatchToSpaceLayer.cpp
+++ b/tests/validation/NEON/BatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,55 +49,38 @@ using NEBatchToSpaceLayerFixture = BatchToSpaceLayerValidationFixture<Tensor, Ac
 
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blockx > blocky
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blocky > blockx
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),     // Mismatching data types
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),     // Wrong data type block shape
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U, 4U), 1, DataType::F32), // Wrong tensor shape
-                                                     }),
-               framework::dataset::make("BlockShapeInfo",{ TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(2U, 4U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(4U, 2U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(2U, 2U), 1, DataType::F16),
-                                                       TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
-                                                     })),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(64U, 16U, 2U, 1U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 32U, 2U, 1U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F16),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                     })),
-               framework::dataset::make("Expected", { true, true, true, false, false, false})),
-               input_info, block_shape_info, output_info, expected)
-{
-    bool has_error = bool(NEBatchToSpaceLayer::validate(&input_info.clone()->set_is_resizable(false), &block_shape_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false)));
-    ARM_COMPUTE_EXPECT(has_error == expected, framework::LogLevel::ERRORS);
-}
-DATA_TEST_CASE(ValidateStatic, framework::DatasetMode::ALL, zip(zip(zip(zip(
+DATA_TEST_CASE(ValidateStatic, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blockx > blocky
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blocky > blockx
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),    // Mismatching data types
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),    // Negative block shapes
-                                                       TensorInfo(TensorShape(32U, 16U, 2U, 4U, 4U), 1, DataType::F32), // Wrong tensor shape
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Supported: blockx != blocky && blockx > blocky
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Supported: blockx != blocky && blocky > blockx
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),     // Invalid: Mismatching data types
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),     // Invalid: Negative block shapes
+                                                       TensorInfo(TensorShape(32U, 16U, 2U, 4U, 4U), 1, DataType::F32),// Unsupported tensor rank
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Invalid output tensor shape (invalid batch dimension)
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Invalid output tensor shape (invalid spatial dimension)
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Supported: correct tensor shape with cropping
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Invalid tensor shape with cropping
                                                      }),
-               framework::dataset::make("BlockShapeX", { 2, 4, 2, 2, 2, 2 })),
-               framework::dataset::make("BlockShapeY", { 2, 2, 4, 2, -2, 2 })),
+               framework::dataset::make("BlockShapeX", { 2, 4, 2, 2, 2, 2, 2, 2, 2, 2 })),
+               framework::dataset::make("BlockShapeY", { 2, 2, 4, 2, -2, 2, 2, 2, 2, 2 })),
+               framework::dataset::make("CropInfo", {
+                CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{3, 2, 1, 3}, CropInfo{3, 2, 1, 3}
+               })),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(64U, 16U, 2U, 1U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 32U, 2U, 1U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(64U, 16U, 2U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 32U, 2U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F16),
                                                        TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 8U, 2U, 1U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(33U, 32U, 2U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(27, 12U, 2U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 16U, 2U, 4U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Expected", { true, true, true, false, false, false})),
-               input_info, block_shape_x, block_shape_y, output_info, expected)
+               framework::dataset::make("Expected", { true, true, true, false, false, false, false, false, true, false})),
+               input_info, block_shape_x, block_shape_y, crop_info, output_info, expected)
 {
-    bool has_error = bool(NEBatchToSpaceLayer::validate(&input_info.clone()->set_is_resizable(false), block_shape_x, block_shape_y, &output_info.clone()->set_is_resizable(false)));
+    bool has_error = bool(NEBatchToSpaceLayer::validate(&input_info.clone()->set_is_resizable(false), block_shape_x, block_shape_y, &output_info.clone()->set_is_resizable(false), crop_info));
     ARM_COMPUTE_EXPECT(has_error == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
@@ -112,6 +95,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchToSpaceLayerFixture<float>, framework::D
     // Validate output
     validate(Accessor(_target), _reference);
 }
+
+FIXTURE_DATA_TEST_CASE(RunSmallWithCropping, NEBatchToSpaceLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(datasets::SmallBatchToSpaceLayerWithCroppingDataset(), framework::dataset::make("DataType",
+                                                                                                                       DataType::F32)),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge, NEBatchToSpaceLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeBatchToSpaceLayerDataset(), framework::dataset::make("DataType",
                                                                                                                      DataType::F32)),
                                                                                                              framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
@@ -129,6 +122,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchToSpaceLayerFixture<half>, framework::Da
     // Validate output
     validate(Accessor(_target), _reference);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallWithCropping, NEBatchToSpaceLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(datasets::SmallBatchToSpaceLayerWithCroppingDataset(), framework::dataset::make("DataType",
+                                                                                                                       DataType::F16)),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge, NEBatchToSpaceLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeBatchToSpaceLayerDataset(), framework::dataset::make("DataType",
                                                                                                                     DataType::F16)),
                                                                                                             framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
@@ -140,7 +142,7 @@ TEST_SUITE_END()
 TEST_SUITE_END()
 
 TEST_SUITE_END() // BatchToSpace
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/BitwiseAnd.cpp b/tests/validation/NEON/BitwiseAnd.cpp
index 8796cc1147..eff649e87a 100644
--- a/tests/validation/NEON/BitwiseAnd.cpp
+++ b/tests/validation/NEON/BitwiseAnd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,7 +53,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEBitwiseAndFixture<uint8_t>, framework::Datase
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // BitwiseAnd
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/BitwiseNot.cpp b/tests/validation/NEON/BitwiseNot.cpp
index 13b3e6d7f3..73b49b6ff1 100644
--- a/tests/validation/NEON/BitwiseNot.cpp
+++ b/tests/validation/NEON/BitwiseNot.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,7 +53,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEBitwiseNotFixture<uint8_t>, framework::Datase
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // BitwiseNot
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/BitwiseOr.cpp b/tests/validation/NEON/BitwiseOr.cpp
index cef712f626..61d1273b37 100644
--- a/tests/validation/NEON/BitwiseOr.cpp
+++ b/tests/validation/NEON/BitwiseOr.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,7 +53,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEBitwiseOrFixture<uint8_t>, framework::Dataset
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // BitwiseOr
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/BitwiseXor.cpp b/tests/validation/NEON/BitwiseXor.cpp
index 3f973935df..727052ed6b 100644
--- a/tests/validation/NEON/BitwiseXor.cpp
+++ b/tests/validation/NEON/BitwiseXor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,7 +53,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEBitwiseXorFixture<uint8_t>, framework::Datase
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // BitwiseXor
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/BoundingBoxTransform.cpp b/tests/validation/NEON/BoundingBoxTransform.cpp
index c662c4107c..2ca2434150 100644
--- a/tests/validation/NEON/BoundingBoxTransform.cpp
+++ b/tests/validation/NEON/BoundingBoxTransform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -153,7 +153,7 @@ TEST_SUITE_END() // QASYMM16
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // BBoxTransform
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Box3x3.cpp b/tests/validation/NEON/Box3x3.cpp
deleted file mode 100644
index 97122f1fe2..0000000000
--- a/tests/validation/NEON/Box3x3.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEBox3x3.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/Box3x3Fixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /* Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /* Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Box3x3)
-
-template <typename T>
-using NEBox3x3Fixture = Box3x3ValidationFixture<Tensor, Accessor, NEBox3x3, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEBox3x3Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::U8)),
-                                                                                                    datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEBox3x3Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::U8)),
-                                                                                                    datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/CannyEdge.cpp b/tests/validation/NEON/CannyEdge.cpp
deleted file mode 100644
index ed46ec16b6..0000000000
--- a/tests/validation/NEON/CannyEdge.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NECannyEdge.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/ArrayAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ImageFileDatasets.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/CannyEdgeFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/* Allowed ratio of mismatches between target and reference (1.0 = 100%) */
-const float allowed_mismatch_ratio = 0.1f;
-
-const auto data = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }),
-                          combine(framework::dataset::make("Normalization", { MagnitudeType::L1NORM, MagnitudeType::L2NORM }), datasets::BorderModes()));
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(CannyEdge)
-
-template <typename T>
-using NECannyEdgeFixture = CannyEdgeValidationFixture<Tensor, Accessor, KeyPointArray, NECannyEdge, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NECannyEdgeFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallImageFiles(), data), framework::dataset::make("Format", Format::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, AbsoluteTolerance<uint8_t>(0), allowed_mismatch_ratio);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NECannyEdgeFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeImageFiles(), data), framework::dataset::make("Format", Format::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, AbsoluteTolerance<uint8_t>(0), allowed_mismatch_ratio);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Cast.cpp b/tests/validation/NEON/Cast.cpp
index c4c350a536..b56594546b 100644
--- a/tests/validation/NEON/Cast.cpp
+++ b/tests/validation/NEON/Cast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,9 +22,12 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NECast.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuCastKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ConvertPolicyDataset.h"
@@ -34,7 +37,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/CastFixture.h"
-
 namespace arm_compute
 {
 namespace test
@@ -99,6 +101,11 @@ const auto CastF32toS32Dataset            = combine(framework::dataset::make("Da
 const auto CastF32toQASYMM8Dataset        = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8));
 const auto CastF32toQASYMM8_SIGNEDDataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
 
+// U64
+const auto CastU64toF32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::F32));
+
+// S64
+const auto CastS64toF32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::F32));
 } // namespace
 
 TEST_SUITE(NEON)
@@ -106,6 +113,8 @@ TEST_SUITE(Cast)
 template <typename T>
 using NECastToU8Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, uint8_t>;
 template <typename T>
+using NECastToS8Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int8_t>;
+template <typename T>
 using NECastToU16Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, uint16_t>;
 template <typename T>
 using NECastToS16Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int16_t>;
@@ -114,6 +123,10 @@ using NECastToU32Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, ui
 template <typename T>
 using NECastToS32Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int32_t>;
 template <typename T>
+using NECastToU64Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, uint64_t>;
+template <typename T>
+using NECastToS64Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int64_t>;
+template <typename T>
 using NECastToF16Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, half>;
 template <typename T>
 using NECastToF32Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, float>;
@@ -187,8 +200,68 @@ CAST_SUITE(F32_to_F16, DataType::F32, DataType::F16, NECastToF16Fixture<float>,
 CAST_SUITE(F32_to_S32, DataType::F32, DataType::S32, NECastToS32Fixture<float>, CastF32toS32Dataset, one_tolerance)
 CAST_SUITE(F32_to_U8, DataType::F32, DataType::S32, NECastToS32Fixture<float>, CastF32toS32Dataset, one_tolerance)
 
+#ifdef __aarch64__
+// S64
+CAST_SUITE(S64_to_F32, DataType::S64, DataType::F32, NECastToF32Fixture<int64_t>, CastS64toF32Dataset, zero_tolerance)
+
+// U64
+CAST_SUITE(U64_to_F32, DataType::U64, DataType::F32, NECastToF32Fixture<uint64_t>, CastU64toF32Dataset, zero_tolerance)
+#endif // __aarch64__
+
+DATA_TEST_CASE(KernelSelectionDstFP16, framework::DatasetMode::ALL,
+               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType",
+{
+    DataType::F16,
+    DataType::U8,
+    DataType::S32,
+    DataType::QASYMM8,
+    DataType::QASYMM8_SIGNED,
+})),
+cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+    const CpuCastKernel::CastKernel *selected_impl;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = true;
+
+    selected_impl = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ data_type, DataType::F16, cpu_isa }, cpu::KernelSelectionType::Preferred);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_cast";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
+DATA_TEST_CASE(KernelSelectionSrcFP32, framework::DatasetMode::ALL,
+               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType",
+{
+    DataType::F16,
+})),
+cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ DataType::F32, data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = lower_string(cpu_ext) + "_fp32_to_" + cpu_impl_dt(data_type) + "_cast";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
 TEST_SUITE_END() // Cast
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ChannelCombine.cpp b/tests/validation/NEON/ChannelCombine.cpp
deleted file mode 100644
index d17f9615e1..0000000000
--- a/tests/validation/NEON/ChannelCombine.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/MultiImage.h"
-#include "arm_compute/runtime/NEON/functions/NEChannelCombine.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ConvertPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ChannelCombineFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(ChannelCombine)
-
-template <typename T>
-using NEChannelCombineFixture = ChannelCombineValidationFixture<MultiImage, Tensor, Accessor, NEChannelCombine, T>;
-
-TEST_SUITE(RGBA)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // RGBA
-
-TEST_SUITE(YUV)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // YUV
-
-TEST_SUITE(YUVPlanar)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::NV12, Format::NV21, Format::IYUV, Format::YUV444 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("FormatType", { Format::NV12, Format::NV21, Format::IYUV, Format::YUV444 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // YUVPlanar
-
-TEST_SUITE_END() // ChannelCombine
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/ChannelExtract.cpp b/tests/validation/NEON/ChannelExtract.cpp
deleted file mode 100644
index 559a014e5d..0000000000
--- a/tests/validation/NEON/ChannelExtract.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/MultiImage.h"
-#include "arm_compute/runtime/NEON/functions/NEChannelExtract.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ConvertPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ChannelExtractFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-// Input data sets
-const auto ChannelExtractRGBADataset = combine(framework::dataset::make("FormatType", { Format::RGBA8888 }),
-                                               framework::dataset::make("ChannelType", { Channel::R, Channel::G, Channel::B, Channel::A }));
-const auto ChannelExtractYUVDataset = combine(framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 }),
-                                              framework::dataset::make("ChannelType", { Channel::Y, Channel::U, Channel::V }));
-const auto ChannelExtractYUVPlanarDataset = combine(framework::dataset::make("FormatType", { Format::IYUV, Format::YUV444, Format::NV12, Format::NV21 }),
-                                                    framework::dataset::make("ChannelType", { Channel::Y, Channel::U, Channel::V }));
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(ChannelExtract)
-
-template <typename T>
-using NEChannelExtractFixture = ChannelExtractValidationFixture<MultiImage, Tensor, Accessor, NEChannelExtract, T>;
-
-TEST_SUITE(RGBA)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelExtractFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ChannelExtractRGBADataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEChannelExtractFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ChannelExtractRGBADataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // RGBA
-
-TEST_SUITE(YUV)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelExtractFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ChannelExtractYUVDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEChannelExtractFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ChannelExtractYUVDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // YUV
-
-TEST_SUITE(YUVPlanar)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelExtractFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ChannelExtractYUVPlanarDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEChannelExtractFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ChannelExtractYUVPlanarDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // YUVPlanar
-
-TEST_SUITE_END() // ChannelExtract
-TEST_SUITE_END() // NEON
-
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/ChannelShuffle.cpp b/tests/validation/NEON/ChannelShuffle.cpp
index d0fa82fc53..9a2a9f24f0 100644
--- a/tests/validation/NEON/ChannelShuffle.cpp
+++ b/tests/validation/NEON/ChannelShuffle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -134,7 +134,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // ChannelShuffle
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Col2Im.cpp b/tests/validation/NEON/Col2Im.cpp
index 5f98b455d9..7eb8cbf0f6 100644
--- a/tests/validation/NEON/Col2Im.cpp
+++ b/tests/validation/NEON/Col2Im.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,8 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NECol2Im.h"
+#include "src/cpu/kernels/CpuCol2ImKernel.h"
+#include "tests/NEON/Helper.h"
 #include "tests/datasets/ShapeDatasets.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
@@ -38,6 +39,8 @@ namespace validation
 TEST_SUITE(NEON)
 TEST_SUITE(Col2Im)
 
+using CpuCol2Im = NESynthetizeFunction<cpu::kernels::CpuCol2ImKernel>;
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
@@ -56,7 +59,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                framework::dataset::make("Expected", { false, false, false, true })),
                input_info, output_info, convolved_width, convolved_height, expected)
 {
-    bool status = bool(NECol2Im::validate(&input_info, &output_info, Size2D(convolved_width, convolved_height)));
+    bool status = bool(CpuCol2Im::validate(&input_info, &output_info, Size2D(convolved_width, convolved_height)));
     ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
diff --git a/tests/validation/NEON/ColorConvert.cpp b/tests/validation/NEON/ColorConvert.cpp
deleted file mode 100644
index ba4df97907..0000000000
--- a/tests/validation/NEON/ColorConvert.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/MultiImage.h"
-#include "arm_compute/runtime/NEON/functions/NEColorConvert.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ColorConvertFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr AbsoluteTolerance<uint8_t> tolerance_nv(2);
-constexpr AbsoluteTolerance<uint8_t> tolerance_u8(2);
-
-// Input data sets
-const auto RGBDataset  = framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 });
-const auto YUYVDataset = framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 });
-
-const auto ColorConvert_RGBA_to_RGB = combine(framework::dataset::make("FormatType", { Format::RGBA8888 }),
-                                              framework::dataset::make("FormatType", { Format::RGB888 }));
-
-const auto ColorConvert_RGB_to_RGBA = combine(framework::dataset::make("FormatType", { Format::RGB888 }),
-                                              framework::dataset::make("FormatType", { Format::RGBA8888 }));
-
-const auto ColorConvert_RGB_to_U8 = combine(framework::dataset::make("FormatType", { Format::RGB888 }),
-                                            framework::dataset::make("FormatType", { Format::U8 }));
-
-const auto ColorConvert_YUYVDataset_to_RGBDataset = combine(YUYVDataset,
-                                                            RGBDataset);
-
-const auto ColorConvert_YUVPlanar_to_RGBDataset = combine(framework::dataset::make("FormatType", { Format::IYUV, Format::NV12, Format::NV21 }),
-                                                          RGBDataset);
-
-const auto ColorConvert_RGBDataset_to_NVDataset = combine(RGBDataset,
-                                                          framework::dataset::make("FormatType", { Format::NV12, Format::IYUV, Format::YUV444 }));
-
-const auto ColorConvert_YUYVDataset_to_NVDataset = combine(YUYVDataset,
-                                                           framework::dataset::make("FormatType", { Format::NV12, Format::IYUV }));
-
-const auto ColorConvert_NVDataset_to_YUVDataset = combine(framework::dataset::make("FormatType", { Format::NV12, Format::NV21 }),
-                                                          framework::dataset::make("FormatType", { Format::IYUV, Format::YUV444 }));
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(ColorConvert)
-
-template <typename T>
-using NEColorConvertFixture = ColorConvertValidationFixture<MultiImage, Tensor, Accessor, NEColorConvert, T>;
-
-TEST_SUITE(RGBA)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_RGBA_to_RGB))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_RGBA_to_RGB))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // RGBA
-
-TEST_SUITE(RGB)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_RGB_to_RGBA))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_RGB_to_RGBA))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // RGB
-
-TEST_SUITE(RGBtoU8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_RGB_to_U8))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx], tolerance_u8);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_RGB_to_U8))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx], tolerance_u8);
-    }
-}
-TEST_SUITE_END() // RGBtoU8
-
-TEST_SUITE(YUV)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_YUYVDataset_to_RGBDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_YUYVDataset_to_RGBDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // YUV
-
-TEST_SUITE(YUVPlanar)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_YUVPlanar_to_RGBDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_YUVPlanar_to_RGBDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // YUVPlanar
-
-TEST_SUITE(NV)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_RGBDataset_to_NVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx], tolerance_nv);
-    }
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_RGBDataset_to_NVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx], tolerance_nv);
-    }
-}
-TEST_SUITE_END() // NV
-
-TEST_SUITE(YUYVtoNV)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_YUYVDataset_to_NVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_YUYVDataset_to_NVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // YUYVtoNV
-
-TEST_SUITE(NVtoYUV)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), ColorConvert_NVDataset_to_YUVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_NVDataset_to_YUVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // NVtoYUV
-
-TEST_SUITE_END() // ColorConvert
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/ConvertFullyConnectedWeights.cpp b/tests/validation/NEON/ConvertFullyConnectedWeights.cpp
index e0fefe30e3..65dbbcc55b 100644
--- a/tests/validation/NEON/ConvertFullyConnectedWeights.cpp
+++ b/tests/validation/NEON/ConvertFullyConnectedWeights.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -131,7 +131,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEConvertFullyConnectedWeightsFixture<uint8_t>,
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE_END() // ConvertFullyConnectedWeights
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Convolution.cpp b/tests/validation/NEON/Convolution.cpp
deleted file mode 100644
index 13bc34c995..0000000000
--- a/tests/validation/NEON/Convolution.cpp
+++ /dev/null
@@ -1,295 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEConvolution.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ConvolutionFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/** Tolerance value for comparing reference's output against implementation
- *
- * This is due to the fact that NEON target performs multiplication with reciprocal of scale,
- * while reference performs direct division with scale.
- */
-constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);
-constexpr AbsoluteTolerance<int16_t> tolerance_s16(1);
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(CustomConvolution)
-TEST_SUITE(Square3x3)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSquareValidationFixture<Tensor, Accessor, NEConvolution3x3, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 3 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 3 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square3x3
-
-TEST_SUITE(Square5x5)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSquareValidationFixture<Tensor, Accessor, NEConvolution5x5, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square5x5
-
-TEST_SUITE(Square7x7)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSquareValidationFixture<Tensor, Accessor, NEConvolution7x7, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square7x7
-
-TEST_SUITE(Square9x9)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSquareValidationFixture<Tensor, Accessor, NEConvolution9x9, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square9x9
-
-TEST_SUITE(Rectangle)
-template <typename T>
-using NEConvolutionFixture = ConvolutionRectangleValidationFixture<Tensor, Accessor, NEConvolutionRectangle, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                                 framework::dataset::make("filter_width", { 3, 5, 7, 9 })),
-                                                                                                         framework::dataset::make("filter_height", { 3, 5, 7, 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                                 framework::dataset::make("filter_width", { 3, 5, 7, 9 })),
-                                                                                                         framework::dataset::make("filter_height", { 3, 5, 7, 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Rectangle
-
-TEST_SUITE(Separable5x5)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSeparableValidationFixture<Tensor, Accessor, NEConvolution5x5, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Separable5x5
-
-TEST_SUITE(Separable7x7)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSeparableValidationFixture<Tensor, Accessor, NEConvolution7x7, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Separable7x7
-
-TEST_SUITE(Separable9x9)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSeparableValidationFixture<Tensor, Accessor, NEConvolution9x9, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Separable9x9
-
-TEST_SUITE_END() // CustomConvolution
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Convolution3D.cpp b/tests/validation/NEON/Convolution3D.cpp
new file mode 100644
index 0000000000..4185488742
--- /dev/null
+++ b/tests/validation/NEON/Convolution3D.cpp
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEConv3D.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/DirectConvolution3DFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+const RelativeTolerance<half_float::half> rel_tolerance_f16(half_float::half(0.2f)); /**< Relative tolerance value for FP16 types */
+const AbsoluteTolerance<float>            abs_tolerance_f16(0.2f);                   /**< Absolute tolerance for FP16 types */
+constexpr float                           tolerance_num = 0.07f;                     /**< Tolerance number for the FP16 implementation */
+#endif                                                                               /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+constexpr AbsoluteTolerance<float>   tolerance_fp32(0.001f);                         /**< Tolerance for floating point tests */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);                           /**< Tolerance for quantized tests */
+
+/** Activation function Dataset*/
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
+});
+
+const auto data_precommit = combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+                                                                                    datasets::SmallDirectConv3DShapes(),
+                                                                                    framework::dataset::make("StrideX", { 1, 5, 8 })),
+                                                                                framework::dataset::make("StrideY", { 1, 2, 3 })),
+                                                                            framework::dataset::make("StrideZ", { 1, 2, 1 })),
+                                                                        framework::dataset::make("PadX", { 0, 1, 2 })),
+                                                                    framework::dataset::make("PadY", { 0, 2, 1 })),
+                                                                framework::dataset::make("PadZ", { 0, 3, 5 })),
+                                                            framework::dataset::make("KernelWidth", { 3, 5, 9 })),
+                                                        framework::dataset::make("KernelHeight", { 2, 1, 3 })),
+                                                    framework::dataset::make("KernelDepth", { 1, 2, 3 })),
+                                                framework::dataset::make("NumKernels", { 2, 3, 8 })),
+                                            framework::dataset::make("HasBias", { true, false })),
+                                    ActivationFunctionsDataset);
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(Convolution3D)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
+        framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Mismatching data type input/weights
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Mismatching input feature maps
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Invalid weights dimensions
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NHWC), // Invalid data layout
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Invalid biases size
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Invalid biases dimensions
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Invalid output size
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::U32, DataLayout::NDHWC), // Invalid data type
+                                              }),
+        framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::F16),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 3U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U, 3U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::U32),
+                                              })),
+        framework::dataset::make("BiasesInfo",{ TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(3U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U, 2U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                              })),
+        framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(26U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::U32),
+                                              })),
+        framework::dataset::make("Expected", { false, false, false, false, false, false, false, false})),
+        input_info, weights_info, biases_info, output_info, expected)
+{
+        const Conv3dInfo  conv3d_info(Size3D(1, 1, 1), Padding3D(0, 0, 0), ActivationLayerInfo(), Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false);
+        bool is_valid = bool(NEConv3D::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv3d_info));
+        ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using NEDirectConvolution3DFixture = DirectConvolution3DValidationFixture<Tensor, Accessor, NEConv3D, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolution3DFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(data_precommit,
+                                                                                                                 framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NDHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+TEST_SUITE_END() // FP32
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolution3DFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(data_precommit,
+                                                                                                                        framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                framework::dataset::make("DataLayout", { DataLayout::NDHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+TEST_SUITE_END() // Float
+
+template <typename T>
+using NEDirectConvolution3DQuantizedFixture = DirectConvolution3DValidationQuantizedFixture<Tensor, Accessor, NEConv3D, T>;
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolution3DQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+                                                                                                                   framework::dataset::make("InputShape", { TensorShape(7U, 5U, 3U, 13U, 3U),
+                                                                                                                           TensorShape(15U, 7U, 11U, 7U),
+                                                                                                                           TensorShape(19U, 5U, 16U, 4U),
+                                                                                                                           TensorShape(13U, 5U, 17U, 2U)
+                                                                                                                                                          }),
+                                                                                                                   framework::dataset::make("StrideX", { 1, 3, 2, 1 })),
+                                                                                                               framework::dataset::make("StrideY", { 2, 1, 3, 1 })),
+                                                                                                           framework::dataset::make("StrideZ", { 3, 2, 1, 1 })),
+                                                                                                       framework::dataset::make("PadX", { 0, 2, 1, 0 })),
+                                                                                                   framework::dataset::make("PadY", { 1, 0, 2, 0 })),
+                                                                                               framework::dataset::make("PadZ", { 2, 1, 0, 0 })),
+                                                                                           framework::dataset::make("KernelWidth", { 3, 7, 5, 1 })),
+                                                                                       framework::dataset::make("KernelHeight", { 5, 3, 7, 1 })),
+                                                                                   framework::dataset::make("KernelDepth", { 7, 5, 3, 1 })),
+                                                                               framework::dataset::make("NumKernels", { 5, 3, 1, 11 })),
+                                                                           framework::dataset::make("HasBias", { true, true, true, false })),
+                                                                       framework::dataset::make("Activation", ActivationLayerInfo())),
+                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                       framework::dataset::make("DataLayout", DataLayout::NDHWC)),
+                                               framework::dataset::make("SrcQuantizationInfo", QuantizationInfo(0.1f, 10))),
+                                       framework::dataset::make("WeightsQuantizationInfo", QuantizationInfo(0.3f, 20))),
+                               framework::dataset::make("DstQuantizationInfo", QuantizationInfo(0.2f, 5))))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolution3DQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+                                                                                                                   framework::dataset::make("InputShape", { TensorShape(7U, 5U, 3U, 13U, 3U),
+                                                                                                                           TensorShape(15U, 7U, 11U, 7U),
+                                                                                                                           TensorShape(19U, 5U, 16U, 4U),
+                                                                                                                           TensorShape(13U, 5U, 17U, 2U)
+                                                                                                                                                          }),
+                                                                                                                   framework::dataset::make("StrideX", { 1, 3, 2, 1 })),
+                                                                                                               framework::dataset::make("StrideY", { 2, 1, 3, 1 })),
+                                                                                                           framework::dataset::make("StrideZ", { 3, 2, 1, 1 })),
+                                                                                                       framework::dataset::make("PadX", { 0, 2, 1, 0 })),
+                                                                                                   framework::dataset::make("PadY", { 1, 0, 2, 0 })),
+                                                                                               framework::dataset::make("PadZ", { 2, 1, 0, 0 })),
+                                                                                           framework::dataset::make("KernelWidth", { 3, 7, 5, 1 })),
+                                                                                       framework::dataset::make("KernelHeight", { 5, 3, 7, 1 })),
+                                                                                   framework::dataset::make("KernelDepth", { 7, 5, 3, 1 })),
+                                                                               framework::dataset::make("NumKernels", { 5, 3, 1, 11 })),
+                                                                           framework::dataset::make("HasBias", { true, true, true, false })),
+                                                                       framework::dataset::make("Activation", ActivationLayerInfo())),
+                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                       framework::dataset::make("DataLayout", DataLayout::NDHWC)),
+                                               framework::dataset::make("SrcQuantizationInfo", QuantizationInfo(0.1f, 10))),
+                                       framework::dataset::make("WeightsQuantizationInfo", QuantizationInfo(0.3f, 20))),
+                               framework::dataset::make("DstQuantizationInfo", QuantizationInfo(0.2f, 5))))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
+TEST_SUITE_END() // Convolution3D
+TEST_SUITE_END() // Neon
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 9dff36b139..d739d4e1a4 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,11 +28,16 @@
 #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuGemmConv2d.h"
+#include "src/cpu/operators/CpuGemmDirectConv2d.h"
+#include "src/cpu/operators/CpuWinogradConv2d.h"
+
 #include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
 #include "tests/datasets/LargeConvolutionLayerDataset.h"
 #include "tests/datasets/SmallConvolutionLayerDataset.h"
-#include "tests/datasets/TinyConvolutionLayerDataset.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
@@ -46,6 +51,8 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
+
 namespace detail
 {
 template <>
@@ -77,10 +84,17 @@ const RelativeTolerance<half_float::half> rel_tolerance_f16(half_float::half(0.2
 const AbsoluteTolerance<float>            abs_tolerance_f16(0.2f);                   /**< Absolute tolerance for FP16 types */
 constexpr float                           tolerance_num = 0.07f;                     /**< Tolerance number for the FP16 implementation */
 #endif                                                                               /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0);                           /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+
+#ifdef ARM_COMPUTE_ENABLE_SME
+// TODO(COMPMID-6011): SME kernels and the reference model use different rounding mode.
+// Temporarily increase the tolerance for quantized data.
+constexpr AbsoluteTolerance<float> tolerance_qasymm8(1.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+#else                                                      // ARM_COMPUTE_ENABLE_SME
+constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+#endif                                                     // ARM_COMPUTE_ENABLE_SME
 
 /** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
+const auto CNNDataTypes = make("DataType",
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     DataType::F16,
@@ -88,14 +102,41 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
     DataType::F32,
     DataType::QASYMM8,
 });
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+const auto ActivationFunctionsDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f)
 });
 
-const auto QuantizationData = framework::dataset::make("QuantizationInfo",
+const auto NoActivation = make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+});
+
+const auto ActivationFunctionsDatasetNightly = make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f, -0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SQUARE),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SWISH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::HARD_SWISH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 2.f, 1.f),
+#ifdef __aarch64__
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::GELU),
+#endif // __aarch64__
+});
+
+const auto QuantizationData = make("QuantizationInfo",
 {
     QuantizationInfo(0.5f, 10),
     QuantizationInfo(0.3f, 3),
@@ -110,32 +151,32 @@ TEST_SUITE(ConvolutionLayer)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-                                          framework::dataset::make("InputInfo", { TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F32),
+                                          make("InputInfo", { TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F32),
                                                                                   TensorInfo(TensorShape(23U, 27U, 32U, 4U), 1, DataType::F32),
                                                                                   TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32),
                                                                                   TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32)
                                           }),
-                                          framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F32),
+                                          make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F32),
                                                                                     TensorInfo(TensorShape(5U, 5U, 32U, 21U), 1, DataType::F32),
                                                                                     TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32),
                                                                                     TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16)
                                           })),
-                                          framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F32),
+                                          make("OutputInfo", { TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F32),
                                                                                    TensorInfo(TensorShape(19U, 23U, 21U, 4U), 1, DataType::F32),
                                                                                    TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32),
                                                                                    TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32)
                                           })),
-                                          framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+                                          make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
                                                                                  PadStrideInfo(1, 1, 0, 0),
                                                                                  PadStrideInfo(2, 1, 0, 0),
                                                                                  PadStrideInfo(3, 2, 1, 0)
                                           })),
-                                          framework::dataset::make("FastMath", { true,
+                                          make("FastMath", { true,
                                                                                  true,
                                                                                  false,
                                                                                  false
                                           })),
-                                                                           framework::dataset::make("Expected", { ConvolutionMethod::WINOGRAD, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
+                                                                           make("Expected", { ConvolutionMethod::WINOGRAD, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
                input_info, weights_info, output_info, conv_info, fast_math, expected)
 {
     ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true),
@@ -147,30 +188,267 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
 // *INDENT-ON*
 TEST_SUITE_END() // ConvolutionLayer
 
+/*
+    Testing Strategy of Neon Winograd:
+        - There is no need to thoroughly test nchw cases because winograd kernels accept
+          nhwc and the tensors are permuted before and after if they're nchw.
+        - Except relu and bounded relu, testing activations for a single input
+          combination is enough because activation is not fused into winograd and called
+          separately.
+*/
 TEST_SUITE(WinogradLayer)
 template <typename T>
 using NEWinogradConvolutionLayerFixture = WinogradConvolutionLayerFastMathValidationFixture<Tensor, Accessor, NEWinogradConvolutionLayer, T>;
+template <typename T>
+using NEWinogradConvolutionLayerMixedDataLayoutFixture = WinogradConvolutionLayerFastMathValidationFixture<Tensor, Accessor, NEWinogradConvolutionLayer, T, T, true, true>;
 
 template <typename T>
 using NEWinogradConvolutionLayerNoBiasFixture = WinogradConvolutionLayerFastMathValidationFixture<Tensor, Accessor, NEWinogradConvolutionLayer, T, T, false>;
 
+/** Test case for memory injection in @ref cpu::CpuWinogradConv2d.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto                winograd = std::make_unique<cpu::CpuWinogradConv2d>();
+    const auto          src_info = TensorInfo(TensorShape(8U, 8U, 32U), 1, DataType::F32);
+    const auto          w_info   = TensorInfo(TensorShape(1U), 1, DataType::F32);
+    const auto          b_info   = TensorInfo(TensorShape(1U, 3U, 32U, 1U), 1, DataType::F32);
+    auto                dst_info = TensorInfo(TensorShape(8U, 6U, 1U), 1, DataType::F32);
+    const PadStrideInfo pad_info{};
+
+    winograd->configure(&src_info, &b_info, &w_info, &dst_info, pad_info);
+
+    // telhs are newly created every call of this lambda function
+    auto a = create_tensor<Tensor>(src_info);
+    auto b = create_tensor<Tensor>(b_info);
+    auto c = create_tensor<Tensor>(w_info);
+    a.allocator()->allocate();
+    b.allocator()->allocate();
+    c.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &a }, { TensorType::ACL_SRC_1, &b }, { TensorType::ACL_SRC_2, &c } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &b }, { TensorType::ACL_SRC_2, &c } };
+
+    auto mg       = MemoryGroup{};
+    auto ws       = manage_workspace<Tensor>(winograd->workspace(), mg, run_pack, prep_pack);
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+        library->fill_tensor_value(Accessor(a), 1.f);
+        library->fill_tensor_value(Accessor(b), 2.f);
+        library->fill_tensor_value(Accessor(c), 3.f);
+
+        // This operator is configured once and captured by this lambda.
+        winograd->prepare(prep_pack);
+        winograd->run(run_pack);
+        return dst;
+    };
+
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Test case for memory injection in @ref NEWinogradConvolutionLayer.
+ *
+ * Make sure @ref NEWinogradConvolutionLayer still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto                gemm     = std::make_unique<NEWinogradConvolutionLayer>();
+    const auto          src_info = TensorInfo(TensorShape(8U, 8U, 32U), 1, DataType::F32);
+    const auto          w_info   = TensorInfo(TensorShape(1U), 1, DataType::F32);
+    const auto          b_info   = TensorInfo(TensorShape(1U, 3U, 32U, 1U), 1, DataType::F32);
+    auto                dst_info = TensorInfo(TensorShape(8U, 6U, 1U), 1, DataType::F32);
+    const PadStrideInfo pad_info{};
+
+    auto run_conv = [&]()
+    {
+        auto src = create_tensor<Tensor>(src_info);
+        auto w   = create_tensor<Tensor>(w_info);
+        auto b   = create_tensor<Tensor>(b_info);
+        auto dst = create_tensor<Tensor>(dst_info);
+
+        gemm->configure(&src, &b, &w, &dst, pad_info);
+
+        src.allocator()->allocate();
+        b.allocator()->allocate();
+        w.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(b), 2.f);
+        library->fill_tensor_value(Accessor(w), 3.f);
+        gemm->run();
+        return dst;
+    };
+
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+DATA_TEST_CASE(SupportedKernels, framework::DatasetMode::ALL, zip(
+                   make("WeightsInfo",
+{
+    // Shapes are always in NCHW format. When layout is NHWC, the shape is permuted
+
+    // Fp32, NCHW/NHWC (layout does not matter as it's )
+    // 3x1, 1x3, 3x3 --> all TRUE
+    TensorInfo(TensorShape(3U, 3U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(1U, 3U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(3U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+
+    // 5x1, 1x5, 5x5 --> all TRUE
+    TensorInfo(TensorShape(5U, 5U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+    TensorInfo(TensorShape(1U, 5U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(5U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+
+    // 7x1, 1x7, 7x7
+    //  --> all FALSE
+    TensorInfo(TensorShape(7U, 7U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+    TensorInfo(TensorShape(1U, 7U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(7U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+
+    // unsupported kernel sizes
+    TensorInfo(TensorShape(2U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(5U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(3U, 6U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+
+    // Fp16
+    TensorInfo(TensorShape(3U, 3U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(1U, 3U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(3U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+
+    // 5x1, 1x5, 5x5 --> all TRUE
+    TensorInfo(TensorShape(5U, 5U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+    TensorInfo(TensorShape(1U, 5U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(5U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+
+    // 7x1, 1x7, 7x7
+    //  --> all FALSE
+    TensorInfo(TensorShape(7U, 7U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+    TensorInfo(TensorShape(1U, 7U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(7U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+
+    // unsupported kernel sizes
+    TensorInfo(TensorShape(2U, 2U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(5U, 2U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(3U, 6U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+
+}),
+make("Expected",
+{
+    // fp32
+    true, true, true,    // 3x3, 1x3, 3x1
+    true, true, true,    // 5x5, 1x5, 5x1
+    false, true, true,   // 7x7, 1x7, 7x1
+    false, false, false, // random unsupported kernels
+
+    // fp16
+    true, false, false,  // 3x3, 1x3, 3x1
+    false, false, false, // 5x5, 1x5, 5x1
+    false, false, false, // 7x7, 1x7, 7x1
+    false, false, false, // random unsupported kernels
+})),
+weights_info_const, expected_const)
+{
+    DataType   data_type   = weights_info_const.data_type();
+    DataLayout data_layout = weights_info_const.data_layout();
+
+    TensorInfo input_info   = TensorInfo(TensorShape(17U, 31U, 2U), 1, data_type);
+    TensorInfo bias_info    = TensorInfo(TensorShape(8U), 1, data_type);
+    TensorInfo weights_info = weights_info_const;
+
+    if(data_layout == DataLayout::NHWC)
+    {
+        // Convert to NHWC
+        PermutationVector perm = PermutationVector(2U, 0U, 1U);
+
+        TensorShape input_shape   = input_info.tensor_shape();
+        TensorShape weights_shape = weights_info.tensor_shape();
+        permute(input_shape, perm);
+        permute(weights_shape, perm);
+
+        input_info.set_tensor_shape(input_shape);
+        weights_info.set_tensor_shape(weights_shape);
+
+        input_info.set_data_layout(data_layout);
+        weights_info.set_data_layout(data_layout);
+        bias_info.set_data_layout(data_layout);
+    }
+
+    PadStrideInfo conv_info(1, 1, 0, 0);
+
+    TensorShape output_shape = compute_deep_convolution_shape(input_info, weights_info, conv_info);
+    TensorInfo  output_info  = TensorInfo(output_shape, 1, data_type, data_layout);
+
+    Status status = NEWinogradConvolutionLayer::validate(
+                        &input_info,
+                        &weights_info,
+                        &bias_info,
+                        &output_info,
+                        conv_info,
+                        ActivationLayerInfo(),
+                        true /* fast math */);
+
+    Status fp16_supported = ::arm_compute::error_on_unsupported_cpu_fp16("N/A", "N/A", 0, &input_info);
+    bool   expected       = expected_const && static_cast<bool>(fp16_supported);
+
+    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+}
+
 TEST_SUITE(FP32)
 
 TEST_SUITE(Conv1x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEWinogradConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT,
+                       combine(
+                           make("Input", TensorShape(8U, 8U, 32U)),
+                           make("Weight", TensorShape(1U, 3U, 32U, 1U)),
+                           make("Bias", TensorShape(1U)),
+                           make("Output", TensorShape(8U, 6U, 1U)),
+                           make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)),
+                           make("Dilation", Size2D(1U, 1U)),
+                           make("DataType", { DataType::F32 }),
+                           ActivationFunctionsDataset,
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -180,19 +458,19 @@ TEST_SUITE_END() // Conv1x3
 
 TEST_SUITE(Conv3x1)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -202,19 +480,19 @@ TEST_SUITE_END() // Conv3x1
 
 TEST_SUITE(Conv1x5)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer1x5Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -224,19 +502,19 @@ TEST_SUITE_END() // Conv1x5
 
 TEST_SUITE(Conv5x1)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer5x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -246,10 +524,10 @@ TEST_SUITE_END() // Conv5x1
 
 TEST_SUITE(Conv7x1)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer7x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer7x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
@@ -257,9 +535,9 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
                        combine(combine(combine(datasets::LargeWinogradConvolutionLayer7x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                                               make("DataType", { DataType::F32 })),
+                                       make("ActivationInfo", { ActivationLayerInfo() })),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -268,20 +546,20 @@ TEST_SUITE_END() // Conv7x1
 
 TEST_SUITE(Conv1x7)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x7Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer1x7Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer7x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer7x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -290,20 +568,40 @@ TEST_SUITE_END() // Conv1x7
 
 TEST_SUITE(Conv3x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
+
+/// It's enough to run the activations for a single weight/input combination and data type because
+/// activation function is called on top of the winograd output as a separate operator
+/// TODO: Enable after COMPMID-6573 is resolved
+FIXTURE_DATA_TEST_CASE(RunActivations, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::DISABLED,
+                       combine(
+                           make("Input", TensorShape(3U, 3U, 32U)),
+                           make("Weight", TensorShape(3U, 3U, 32U, 4U)),
+                           make("Bias", TensorShape(4U)),
+                           make("Output", TensorShape(1U, 1U, 4U)),
+                           make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)),
+                           make("Dilation", Size2D(1U, 1U)),
+                           make("DataType", { DataType::F32 }),
+                           ActivationFunctionsDatasetNightly,
+                           make("DataLayout", { DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, abs_tolerance_f32);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 
 {
     // Validate output
@@ -314,20 +612,20 @@ TEST_SUITE_END() // Conv3x3
 
 TEST_SUITE(Conv5x5)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 
 {
     // Validate output
@@ -337,12 +635,12 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, frame
 TEST_SUITE_END() // Conv5x5
 
 FIXTURE_DATA_TEST_CASE(RunSmallNoBias, NEWinogradConvolutionLayerNoBiasFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
-                                                                          datasets::SmallWinogradConvolutionLayer5x5Dataset()),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(framework::dataset::concat(
+                                   datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                                   datasets::SmallWinogradConvolutionLayer5x5Dataset()),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
@@ -354,12 +652,39 @@ TEST_SUITE_END() // FP32
 TEST_SUITE(FP16)
 using CLWinogradConvolutionLayerFastMathFixture16 = WinogradConvolutionLayerFastMathValidationFixture<Tensor, Accessor, NEWinogradConvolutionLayer, half, float>;
 
+DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(
+                   make("InputInfo", { TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F16),
+                                       TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F16)
+                                     }),
+                   make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F16),
+                                         TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F16)
+                                       }),
+                   make("OutputInfo", { TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F32),
+                                        TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F16)
+                                      }),
+                   make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+                                      PadStrideInfo(1, 1, 0, 0)
+                                    }),
+                   make("FastMath",
+{
+    false, // case fp16 and fast_math False then disable Winograd
+    true   // case fp16 and fast_math True then enable Winograd
+}),
+make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD })),
+input_info, weights_info, output_info, conv_info, fast_math, expected)
+{
+    ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true),
+                                                                            &weights_info.clone()->set_is_resizable(true),
+                                                                            &output_info.clone()->set_is_resizable(true), conv_info, WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), fast_math);
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+
 TEST_SUITE(Conv3x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                               make("DataType", { DataType::F16 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
 {
     // Validate output
@@ -367,10 +692,10 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, fr
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
+                               make("DataType", { DataType::F16 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 
 {
     // Validate output
@@ -381,16 +706,470 @@ TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 TEST_SUITE_END() // WinogradLayer
 
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+TEST_SUITE(FIXED_FORMAT_KERNELS)
+TEST_SUITE(VariableWeightUtils)
+
+// UC2_1_* tests: the user requests a specific fixed format, but there is no kernel that supports it.
+
+template <typename ConvolutionClass>
+using HasOptImplFixtureNoFastMath = HasOptImplFixture<ConvolutionClass, /*enable_fast_math*/ false>;
+
+template <typename ConvolutionClass>
+using HasOptImplFixtureFastMath = HasOptImplFixture<ConvolutionClass, /*enable_fast_math*/ true>;
+
+// UC2_1
+
+FIXTURE_DATA_TEST_CASE(UC2_1_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+FIXTURE_DATA_TEST_CASE(UC2_1_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC2_1_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC2_1_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+// UC2_2_* tests: the user requests a specific fixed format, and a
+// kernel that support that fixed format is found.
+
+FIXTURE_DATA_TEST_CASE(UC2_2_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo4 })))
+{
+    ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format == arm_compute::WeightFormat::OHWIo4, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC2_2_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo4 })))
+{
+    ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format == arm_compute::WeightFormat::OHWIo4, framework::LogLevel::ERRORS);
+}
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+// These tests currently only works with SVE length 256
+// If other SVE length is used a kernel will fail to be found
+// This needs to be addressed in order to ensure it doesn't revert to FP32 kernels for systems with SVE length other than 256
+FIXTURE_DATA_TEST_CASE(UC2_2_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo8i4_bf16 })))
+{
+    if(Scheduler::get().cpu_info().has_bf16() && (arm_gemm::utils::get_vector_length<float>() == 8)){
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT_EQUAL(_computed_weight_format, arm_compute::WeightFormat::OHWIo8i4_bf16, framework::LogLevel::ERRORS);
+    }
+    else{
+        ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(UC2_2_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo8i4_bf16 })))
+{
+    if(Scheduler::get().cpu_info().has_bf16() && (arm_gemm::utils::get_vector_length<float>() == 8)){
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format == arm_compute::WeightFormat::OHWIo8i4_bf16, framework::LogLevel::ERRORS);
+    }
+    else{
+        ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+    }
+}
+
+#endif // ARM_COMPUTE_ENABLE_BF16
+
+// UC3_1_* tests: the user queries for ANY fixed format, but there is
+// no kernel that support the use case specified by the user (for
+// example, there is no fixed format kernel for the datatype of the
+// problem).
+
+FIXTURE_DATA_TEST_CASE(UC3_1_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::S32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_1_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::S32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_1_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::S32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_1_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::S32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+// UC3_2_* tests: the user queries for ANY fixed format. The search
+// succeeded and the fixed format found is prompted back for
+// consumption by the user. Note that we just test the
+// _computed_weight_format to be anything but not the formats that are
+// not fixed formats (ANY and UNSPECIFIED). This is because the weight
+// format that the runtime produces depends on the size of the vector
+// units of the hardware where the tests is executed. For example, a
+// format like OHWIo4 for FP32 data returned for 128-bit NEON hardware
+// is replaced by OHWIo8 when running on 256-bit SVE.
+
+FIXTURE_DATA_TEST_CASE(UC3_2_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_2_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+}
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+
+FIXTURE_DATA_TEST_CASE(UC3_2_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    if(Scheduler::get().cpu_info().has_bf16()){
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS);
+    }
+    else{
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_2_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    if(Scheduler::get().cpu_info().has_bf16()){
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS);
+    }
+    else{
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS);
+    }
+}
+
+#endif // ARM_COMPUTE_ENABLE_BF16
+
+namespace
+{
+using TestCaseType          = std::tuple<TensorShape, TensorShape, arm_compute::WeightFormat>;
+auto prepare_weights_shapes = framework::dataset::make("TensorShape",
+{
+    // OHWIo<interleave_by>i<block_by>
+    //
+    // OHWI --> O'HWI', where:
+    //
+    //   O'= smallest multiple of <interleave_by> such that O<=O'
+    //   I'= smallest multiple of <block_by> such that I<=I'
+    //
+
+    // Change N for OHWIo4
+    TestCaseType({ { 1U, 1U, 1U, 1U }, { 1U, 1U, 1U, 4U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 2U }, { 1U, 1U, 1U, 4U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 3U }, { 1U, 1U, 1U, 4U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 4U }, { 1U, 1U, 1U, 4U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 5U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 6U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 7U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 8U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 9U }, { 1U, 1U, 1U, 12U }, arm_compute::WeightFormat::OHWIo4 }),
+    // // Change N for OHWIo8
+    TestCaseType({ { 1U, 1U, 1U, 1U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 2U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 3U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 4U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 5U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 6U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 7U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 8U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 9U }, { 1U, 1U, 1U, 16U }, arm_compute::WeightFormat::OHWIo8 }),
+    // // Change N for OHWIo4 when H, W and C are not 1
+    TestCaseType({ { 3U, 4U, 2U, 1U }, { 3, 4, 2, 4 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 2U }, { 3, 4, 2, 4 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 3U }, { 3, 4, 2, 4 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 4U }, { 3, 4, 2, 4 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 5U }, { 3, 4, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 6U }, { 3, 4, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 7U }, { 3, 4, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 8U }, { 3, 4, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 9U }, { 3, 4, 2, 12 }, arm_compute::WeightFormat::OHWIo4 }),
+
+    // // Fix N and move HWI around, with different data layouts and formats
+    TestCaseType({ { 2U, 4U, 3U, 5U }, { 2, 4, 3, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 5U }, { 3, 4, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 2U, 4U, 3U, 9U }, { 2, 4, 3, 16 }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 3U, 4U, 2U, 9U }, { 3, 4, 2, 16 }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1024U, 1U, 1U, 1001U }, { 1024, 1, 1, 1008 }, arm_compute::WeightFormat::OHWIo8 }),
+
+    // // Adding <block_by> on I (=C)
+    TestCaseType({ { 1U, 4U, 3U, 5U }, { 2, 4, 3, 8 }, arm_compute::WeightFormat::OHWIo4i2 }),
+    TestCaseType({ { 2U, 4U, 3U, 5U }, { 2, 4, 3, 8 }, arm_compute::WeightFormat::OHWIo4i2 }),
+    TestCaseType({ { 3U, 4U, 3U, 5U }, { 4, 4, 3, 8 }, arm_compute::WeightFormat::OHWIo4i2 }),
+
+    // ---------
+    TestCaseType({ { 2, 2, 1, 5 }, { 2, 2, 1, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1, 2, 2, 5 }, { 1, 2, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+
+});
+} // unnamed namespace
+
+DATA_TEST_CASE(PrepareWeightShape, framework::DatasetMode::ALL,
+               prepare_weights_shapes, shapes)
+{
+    const TensorShape               input_shape    = std::get<0>(shapes);
+    const TensorShape               expected_shape = std::get<1>(shapes);
+    const arm_compute::WeightFormat wf             = std::get<2>(shapes);
+    const DataType                  DT             = DataType::F32;
+    const DataLayout                DL             = DataLayout::NHWC;
+    const auto                      TI             = TensorInfo(input_shape, 1 /*num_channels, deprecated*/, DT, DL);
+    const TensorInfo                computed_info  = ::arm_compute::test::validation::prepare_weights(TI, wf);
+    ARM_COMPUTE_EXPECT_EQUAL(computed_info.tensor_shape(), expected_shape, framework::LogLevel::ERRORS);
+}
+
+TEST_SUITE_END() // VariableWeightUtils
+
+TEST_SUITE(ExperimentalCpuAPIVariableWeightWithFixtures)
+
+template <typename ScalarType>
+using VarWidth = VariableWeightsFixture<cpu::CpuGemmConv2d, Tensor, Accessor, ScalarType, /*enable_fast_math*/ false>;
+
+FIXTURE_DATA_TEST_CASE(RunSmallFloat, VarWidth<float>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F32 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+FIXTURE_DATA_TEST_CASE(RunSmallHalf, VarWidth<half>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F16 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f16, 0.f, half(abs_tolerance_f16));
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+template <typename ScalarType>
+using VarWidthFastMath = VariableWeightsFixture<cpu::CpuGemmConv2d, Tensor, Accessor, ScalarType, /*enable_fast_math*/ true>;
+
+FIXTURE_DATA_TEST_CASE(RunSmallFloatFastMath, VarWidthFastMath<float>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F32 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+#endif // ARM_COMPUTE_ENABLE_BF16
+
+TEST_SUITE_END() // ExperimentalCpuAPIVariableWeightWithFixtures
+
+TEST_SUITE(ExperimentalNEAPIVariableWeightWithFixtures)
+
+template <typename ScalarType>
+using NEGEMMVarWidth = VariableWeightsFixtureNEInterface<NEGEMMConvolutionLayer, Tensor, Accessor, ScalarType, /*enable_fast_math*/ false>;
+
+FIXTURE_DATA_TEST_CASE(NEGEMMRunSmallFloat, NEGEMMVarWidth<float>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F32 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+FIXTURE_DATA_TEST_CASE(NEGEMMRunSmallHalf, NEGEMMVarWidth<half>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F16 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f16, 0.f, half(abs_tolerance_f16));
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+template <typename ScalarType>
+using NEGEMMVarWidthFastMath = VariableWeightsFixtureNEInterface<NEGEMMConvolutionLayer, Tensor, Accessor, ScalarType, /*enable_fast_math*/ true>;
+
+FIXTURE_DATA_TEST_CASE(NEGEMMRunSmallFloatFastMath, NEGEMMVarWidthFastMath<float>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F32 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+#endif // ARM_COMPUTE_ENABLE_BF16
+
+TEST_SUITE_END() // ExperimentalNEAPIVariableWeightWithFixtures
+TEST_SUITE_END() // FIXED_FORMAT_KERNELS
+
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+
 TEST_SUITE(GEMMConvolutionLayer)
 template <typename T>
 using NEGEMMConvolutionLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEConvolutionLayer, T>;
+template <typename T>
+using NEGEMMConvolutionLayerPaddedWeightsFixture = ConvolutionValidationPaddedWeightsFixture<Tensor, Accessor, NEConvolutionLayer, T>;
+template <typename T>
+using NEGEMMConvolutionLayerMixedDataLayoutFixture = ConvolutionValidationFixture<Tensor, Accessor, NEConvolutionLayer, T, true>;
+
+/** Test case for memory injection in @ref cpu::CpuGemmConv2d.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto        conv        = std::make_unique<cpu::CpuGemmConv2d>();
+    const auto  src_info    = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  bias_info   = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NCHW);
+    auto        dst_info    = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  conv_info   = PadStrideInfo(1, 1, 0, 0, 2, 2, DimensionRoundingType::FLOOR);
+    WeightsInfo weights_info(false, 3U, 3U, 1U);
+    conv->configure(&src_info, &weight_info, &bias_info, &dst_info, conv_info, weights_info);
+
+    // tensors are newly created every call of this lambda function
+    auto src    = create_tensor<Tensor>(src_info);
+    auto weight = create_tensor<Tensor>(weight_info);
+    auto bias   = create_tensor<Tensor>(bias_info);
+    src.allocator()->allocate();
+    weight.allocator()->allocate();
+    bias.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(conv->workspace(), mg, run_pack, prep_pack);
+
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        // This operator is configured once and captured by this lambda.
+        conv->prepare(prep_pack);
+        conv->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Test case for memory injection in @ref NEGEMMConvolutionLayer.
+ *
+ * Make sure @ref NEGEMMConvolutionLayer still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto        conv        = std::make_unique<NEGEMMConvolutionLayer>();
+    const auto  src_info    = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  bias_info   = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NCHW);
+    auto        dst_info    = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  conv_info   = PadStrideInfo(1, 1, 0, 0, 2, 2, DimensionRoundingType::FLOOR);
+    WeightsInfo weights_info(false, 3U, 3U, 1U);
+    auto        run_conv = [&]()
+    {
+        auto src    = create_tensor<Tensor>(src_info);
+        auto weight = create_tensor<Tensor>(weight_info);
+        auto bias   = create_tensor<Tensor>(bias_info);
+        auto dst    = create_tensor<Tensor>(dst_info);
+        conv->configure(&src, &weight, &bias, &dst, conv_info, weights_info);
+        src.allocator()->allocate();
+        weight.allocator()->allocate();
+        bias.allocator()->allocate();
+        dst.allocator()->allocate();
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        conv->run();
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
 
 TEST_SUITE(Float)
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
+#if defined(ARM_COMPUTE_ENABLE_BF16)
 TEST_SUITE(BFLOAT16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                                                                                                                     framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                    framework::dataset::make("DataType", DataType::BFLOAT16)),
+                                                                                                                    framework::dataset::make("DataType", Scheduler::get().cpu_info().has_bf16() ? DataType::BFLOAT16 : DataType::F32)),
                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NHWC })),
                                                                                                             ActivationFunctionsDataset))
 {
@@ -398,7 +1177,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<float>, framework
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
 }
 TEST_SUITE_END() // BFLOAT16
-#endif           /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
+#endif           /* defined(ARM_COMPUTE_ENABLE_BF16) */
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -424,11 +1203,59 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<float>, framework
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEGEMMConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                           framework::dataset::make("Input", TensorShape(23U, 27U, 5U)),
+                                                                                           framework::dataset::make("Weights", TensorShape(3U, 3U, 5U, 2U))),
+                                                                                       framework::dataset::make("Bias", TensorShape(2U))),
+                                                                               framework::dataset::make("Output", TensorShape(11U, 25U, 2U))),
+                                                                       framework::dataset::make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0))),
+                                                               framework::dataset::make("Dilation", Size2D(1, 1))),
+                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                               framework::dataset::make("DataType", DataType::F32)),
+                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                               ActivationFunctionsDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+/** Padded weights
+ * CpuGemmConv2d uses two different paths for reshaping the weights based on if the weight tensor has holes (a common
+ * way to have "holes" in tensor is via extended paddings)
+ *
+ * We only need to test the padded weight path here on a single floating data type and a single layout, because the fallback path is agnostic of them
+ */
+FIXTURE_DATA_TEST_CASE(RunPaddedWeights, NEGEMMConvolutionLayerPaddedWeightsFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                    framework::dataset::make("ReshapeWeights", { true }),
+                                                                                                                    framework::dataset::make("DataType", DataType::F32),
+                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NHWC })
+                                                                                                            ))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+
+// This very large shape test is required to test heuristic paths where the tensor size is > 1e7 bytes
+// and weight dimensions larger than 7
+FIXTURE_DATA_TEST_CASE(RunVeryLarge, NEGEMMConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::VeryLargeConvolutionLayerDataset(),
+        framework::dataset::make("ReshapeWeights", { true }),
+        framework::dataset::make("DataType", DataType::F32),
+        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+// TODO: COMPMID-6596 Extend quantized tests with at least one suite where the weight is padded (the legacy case, see floating point's RunPaddedWeights)
 template <typename T>
 using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEConvolutionLayer, T>;
+template <typename T>
+using NEGEMMConvolutionLayerQuantizedMixedDataLayoutFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEConvolutionLayer, T, true>;
 
 template <typename T>
 using NEGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEConvolutionLayer, T, int8_t>;
@@ -440,17 +1267,39 @@ const auto QuantizedActivationFunctionsDataset = framework::dataset::make("Activ
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
 });
 TEST_SUITE(Quantized)
+/// @note: Every asymmetric quantized test where there's no fused activation will have its quantization info ignored
+/// This is because instead of using the same quantization information for all the tensors, the fixture generates
+/// separate quantization info for each input and the output tensor.
+/// When we can also support dynamic quantization with the presence of activation, these two versions should be merged
+/// again, with the explicitly specified quantization info removed
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                                                                                                                        framework::dataset::make("ReshapeWeights", { true })),
                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                                                                                                       framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(2.f / 255.f, 10) })),
                                                                                                                        QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                   framework::dataset::make("Input", TensorShape(23U, 27U, 5U)),
+                                                                                                   framework::dataset::make("Weights", TensorShape(3U, 3U, 5U, 2U))),
+                                                                                               framework::dataset::make("Bias", TensorShape(2U))),
+                                                                                       framework::dataset::make("Output", TensorShape(11U, 25U, 2U))),
+                                                                               framework::dataset::make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0))),
+                                                                       framework::dataset::make("Dilation", Size2D(1, 1))),
+                                                               framework::dataset::make("ReshapeWeights", { true })),
+                                                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                       framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(2.f / 255.f, 10) })),
+                               QuantizedActivationFunctionsDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
@@ -458,12 +1307,29 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedFixture<int8_t>,
                                                                                                                       framework::dataset::make("ReshapeWeights", { true })),
                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.01f, -10) })),
+                                                                                                                      framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(0.01f, -10) })),
                                                                                                                       QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEGEMMConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                   framework::dataset::make("Input", TensorShape(23U, 27U, 5U)),
+                                                                                                   framework::dataset::make("Weights", TensorShape(3U, 3U, 5U, 2U))),
+                                                                                               framework::dataset::make("Bias", TensorShape(2U))),
+                                                                                       framework::dataset::make("Output", TensorShape(11U, 25U, 2U))),
+                                                                               framework::dataset::make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0))),
+                                                                       framework::dataset::make("Dilation", Size2D(1, 1))),
+                                                               framework::dataset::make("ReshapeWeights", { true })),
+                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                       framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(2.f / 255.f, 10) })),
+                               QuantizedActivationFunctionsDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 TEST_SUITE(QSYMM8_PER_CHANNEL)
@@ -491,6 +1357,27 @@ FIXTURE_DATA_TEST_CASE(RunSmallSigned, NEGEMMConvolutionLayerQuantizedPerChannel
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+
+FIXTURE_DATA_TEST_CASE(MemoryStressLargeChannels, NEGEMMConvolutionLayerQuantizedPerChannelFixture<int8_t>,
+    framework::DatasetMode::ALL,
+        combine(
+            make("In", TensorShape(1U)),
+            make("Weights", TensorShape(1U, 1U, 1U, 17000U)),
+            make("Biases", TensorShape(17000U)),
+            make("Out", TensorShape(1U, 1U, 17000U)),
+            make("Info", PadStrideInfo(1, 1, 0, 0)),
+            make("Dilation", Size2D(1, 1)),
+            make("ReshapeWeights", { true }),
+            make("DataType", { DataType::QASYMM8_SIGNED }),
+            make("DataLayout", { DataLayout::NHWC }),
+            make("QuantizationInfo", QuantizationInfo(0.5f, 10)),
+            make("ActivationInfo", ActivationLayerInfo()),
+            make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
 TEST_SUITE_END() // QSYMM8_PER_CHANNEL
 TEST_SUITE_END() // Quantized
 
@@ -500,6 +1387,99 @@ TEST_SUITE(DirectGEMMConv2d)
 template <typename T>
 using NEDirectGEMMConv2dLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEGEMMConv2d, T>;
 
+/** Test case for memory injection in @ref cpu::CpuGemmDirectConv2d.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto       conv        = std::make_unique<cpu::CpuGemmDirectConv2d>();
+    const auto src_info    = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NHWC);
+    const auto weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NHWC);
+    const auto bias_info   = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NHWC);
+    auto       dst_info    = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NHWC);
+    const auto conv_info   = Conv2dInfo{};
+    conv->configure(&src_info, &weight_info, &bias_info, &dst_info, conv_info);
+
+    // tensors are newly created every call of this lambda function
+    auto src    = create_tensor<Tensor>(src_info);
+    auto weight = create_tensor<Tensor>(weight_info);
+    auto bias   = create_tensor<Tensor>(bias_info);
+    src.allocator()->allocate();
+    weight.allocator()->allocate();
+    bias.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(conv->workspace(), mg, run_pack, prep_pack);
+
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        // This operator is configured once and captured by this lambda.
+        conv->prepare(prep_pack);
+        conv->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Test case for memory injection in @ref NEGEMMConv2d.
+ *
+ * Make sure @ref NEGEMMConv2d still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto       conv        = std::make_unique<NEGEMMConv2d>();
+    const auto src_info    = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NHWC);
+    const auto weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NHWC);
+    const auto bias_info   = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NHWC);
+    auto       dst_info    = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NHWC);
+    const auto conv_info   = Conv2dInfo{};
+    auto       run_conv    = [&]()
+    {
+        auto src    = create_tensor<Tensor>(src_info);
+        auto weight = create_tensor<Tensor>(weight_info);
+        auto bias   = create_tensor<Tensor>(bias_info);
+        auto dst    = create_tensor<Tensor>(dst_info);
+        conv->configure(&src, &weight, &bias, &dst, conv_info);
+        src.allocator()->allocate();
+        weight.allocator()->allocate();
+        bias.allocator()->allocate();
+        dst.allocator()->allocate();
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        conv->run();
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
@@ -573,7 +1553,7 @@ TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // DirectGEMMConv2d
 
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Copy.cpp b/tests/validation/NEON/Copy.cpp
index 7bcb00810f..e2e5210e89 100644
--- a/tests/validation/NEON/Copy.cpp
+++ b/tests/validation/NEON/Copy.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -92,7 +92,7 @@ TEST_SUITE_END() // U16
 TEST_SUITE_END() // FixedSeed
 
 TEST_SUITE_END() // Copy
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/CropResize.cpp b/tests/validation/NEON/CropResize.cpp
index 298d393312..df7166bfdc 100644
--- a/tests/validation/NEON/CropResize.cpp
+++ b/tests/validation/NEON/CropResize.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -187,7 +187,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
 TEST_SUITE_END() // S32
 
 TEST_SUITE_END() // CropResize
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/DeconvolutionLayer.cpp b/tests/validation/NEON/DeconvolutionLayer.cpp
index 734e24fbea..b4c049f6f9 100644
--- a/tests/validation/NEON/DeconvolutionLayer.cpp
+++ b/tests/validation/NEON/DeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,54 +47,86 @@ constexpr AbsoluteTolerance<float> tolerance_fp32(0.001f);    /**< Tolerance for
 constexpr AbsoluteTolerance<float> tolerance_quantized(1.0f); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 const RelativeTolerance<half_float::half> tolerance_fp16(half_float::half(0.2f)); /**< Relative tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr float                           tolerance_num_fp16 = 0.02f;             /**< Tolerance number for FP16 tests -- follows a slightly stricter approach compared to ConvolutionLayer tests */
 #endif                                                                            /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
-constexpr float tolerance_num = 0.07f;                                            /**< Tolerance number */
+constexpr float tolerance_num_quant = 0.07f;                                      /**< Tolerance number for quantized types */
 
 const auto data4x4 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 3)
-                     * framework::dataset::make("PadY", 0, 3) * framework::dataset::make("NumKernels", { 3 });
+                     * framework::dataset::make("PadY", 0, 3) * framework::dataset::make("NumKernels",
+{
+    3
+});
 
 const auto data3x3 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 2)
-                     * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels", { 3 });
+                     * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels",
+{
+    3
+});
 
 const auto data3x3_asymm = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 2) * framework::dataset::make("StrideY", 1, 2) * framework::dataset::make("PadLeft", 0, 1)
-                           * framework::dataset::make("PadRight", 0, 1) * framework::dataset::make("PadTop", 0, 1) * framework::dataset::make("PadBottom", 0, 1) * framework::dataset::make("NumKernels", { 3 });
+                           * framework::dataset::make("PadRight", 0, 1) * framework::dataset::make("PadTop", 0, 1) * framework::dataset::make("PadBottom", 0, 1) * framework::dataset::make("NumKernels",
+{
+    3
+});
 
-const auto data9x9_small_asymm = framework::dataset::make("InputShape", TensorShape{ 10U, 10U, 1U, 1U }) *framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY",
-                                 2)
-                                 *framework::dataset::make("PadLeft", 3)
-                                 *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop", 3) *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 });
+const auto data9x9_small_asymm = framework::dataset::make("InputShape", TensorShape
+{
+    10U, 10U, 1U, 1U
+})
+*framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY", 2) *framework::dataset::make("PadLeft", 3) *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop",
+        3)  *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 });
 
-const auto data9x9_large_asymm = framework::dataset::make("InputShape", TensorShape{ 640U, 360U, 56U, 1U }) *framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY",
-                                 2)
-                                 *framework::dataset::make("PadLeft", 3)
-                                 *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop", 3) *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 });
+const auto data9x9_large_asymm = framework::dataset::make("InputShape", TensorShape
+{
+    640U, 360U, 56U, 1U
+})
+*framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY", 2) *framework::dataset::make("PadLeft", 3) *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop",
+        3)  *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 });
 
 const auto data3x3_precommit = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 2) * framework::dataset::make("StrideY", 1, 2) * framework::dataset::make("PadX", 0, 2)
-                               * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels", { 3 });
+                               * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels",
+{
+    3
+});
 
 const auto data1x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 1)
-                     * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels", { 3 });
+                     * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels",
+{
+    3
+});
+
+const auto data5x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 1)
+                     * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels",
+{
+    3
+});
 
-const auto data_layouts_dataset = framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC });
+const auto data_layouts_dataset = framework::dataset::make("DataLayout",
+{
+    DataLayout::NCHW, DataLayout::NHWC
+});
 
-const auto add_bias_dataset = framework::dataset::make("AddBias", { true, false });
+const auto add_bias_dataset = framework::dataset::make("AddBias",
+{
+    true, false
+});
 
 const auto input_qinfo_dataset = framework::dataset::make("InputQInfo",
 {
     QuantizationInfo(1.f / 255.f, 0),
-    QuantizationInfo(2.f, 0),
+                     QuantizationInfo(2.f, 0),
 });
 
 const auto output_qinfo_dataset = framework::dataset::make("OutputQInfo",
 {
     QuantizationInfo(3.f / 255.f, 0),
-    QuantizationInfo(4.f, 0),
+                     QuantizationInfo(4.f, 0),
 });
+
 } // namespace
 
 TEST_SUITE(NEON)
 TEST_SUITE(DeconvolutionLayer)
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
@@ -104,6 +136,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),  // Invalid bias shape
                                             TensorInfo(TensorShape(13U, 11U, 4U, 3U), 1, DataType::F32), // Window shrink
                                             TensorInfo(TensorShape(32U, 16U, 2U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(2U,2U,1U,1U), 1, DataType::F32),    // Small shape no padding
+                                            TensorInfo(TensorShape(3U,26U,26U,1U), 1, DataType::F32),    // Negative padding
                                           }),
     framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16),
                                             TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32),
@@ -111,6 +145,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                             TensorInfo(TensorShape(3U, 2U, 2U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(3U, 3U, 4U), 1, DataType::F32),
                                               TensorInfo(TensorShape(1U, 1U, 2U, 4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(3U,3U,1U,1U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(1U,1U,26U,88U), 1, DataType::F32),
                                           })),
     framework::dataset::make("BiasInfo",  { TensorInfo(TensorShape(1U), 1, DataType::F16),
                                             TensorInfo(TensorShape(1U), 1, DataType::F32),
@@ -118,6 +154,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                             TensorInfo(TensorShape(25U, 11U), 1, DataType::F32),
                                             TensorInfo(TensorShape(1U), 1, DataType::F32),
                                             TensorInfo(TensorShape(4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(1U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(88U), 1, DataType::F32),
                                           })),
     framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F16),
                                             TensorInfo(TensorShape(25U, 10U, 2U), 1, DataType::F32),
@@ -125,6 +163,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                             TensorInfo(TensorShape(13U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(11U, 9U, 1U, 3U), 1, DataType::F32),
                                             TensorInfo(TensorShape(32U, 16U, 4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(4U,4U,1U,1U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(1U,78U,88U,1U), 1, DataType::F32),
                                           })),
     framework::dataset::make("PadStrideInfo", { PadStrideInfo(1, 1, 0, 0),
                                                 PadStrideInfo(1, 1, 0, 0),
@@ -132,8 +172,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                                 PadStrideInfo(1, 1, 0, 0),
                                                 PadStrideInfo(1, 1, 1, 1),
                                                 PadStrideInfo(1, 1, 0, 0),
+                                                PadStrideInfo(1, 1, 0, 0),
+                                                PadStrideInfo(2, 3, 3, 1),
                                            })),
-    framework::dataset::make("Expected", { false, false, false, false, false, true })),
+    framework::dataset::make("Expected", { false, false, false, false, false, true,true, false })),
     input_info, weights_info, bias_info, output_info, pad_info, expected)
 {
     bool is_valid = bool(NEDeconvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pad_info));
@@ -157,6 +199,9 @@ using NEDeconvolutionLayerAsymmFixture9x9 = DeconvolutionValidationAsymmFixture<
 template <typename T>
 using NEDeconvolutionLayerFixture1x1 = DeconvolutionValidationFixture<Tensor, Accessor, NEDeconvolutionLayer, T, 1, 1>;
 
+template <typename T>
+using NEDeconvolutionLayerFixture5x1 = DeconvolutionValidationFixture<Tensor, Accessor, NEDeconvolutionLayer, T, 5, 1>;
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 TEST_SUITE(W4x4)
@@ -220,6 +265,15 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerAsymmFixture9x9<float>, fra
     validate(Accessor(_target), _reference, tolerance_fp32);
 }
 TEST_SUITE_END() // W9x9
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture5x1<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data5x1, framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                    data_layouts_dataset),
+                                                                                                            add_bias_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+TEST_SUITE_END() // W5x1
 TEST_SUITE_END() // FP32
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -230,7 +284,7 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture4x4<half>, framework::Dat
                                                                                                            add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_fp16);
+    validate(Accessor(_target), _reference, tolerance_fp16, tolerance_num_fp16);
 }
 TEST_SUITE_END() // W4x4
 TEST_SUITE(W3x3)
@@ -240,14 +294,14 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDeconvolutionLayerFixture3x3<half>, framework
                                                                                                                   add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_fp16);
+    validate(Accessor(_target), _reference, tolerance_fp16, tolerance_num_fp16);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data3x3, framework::dataset::make("DataType", DataType::F16)),
                                                                                                                         data_layouts_dataset),
                                                                                                                 add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_fp16);
+    validate(Accessor(_target), _reference, tolerance_fp16, tolerance_num_fp16);
 }
 TEST_SUITE_END() // W3x3
 TEST_SUITE(W1x1)
@@ -256,9 +310,18 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture1x1<half>, framework::Dat
                                                                                                            add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_fp16);
+    validate(Accessor(_target), _reference, tolerance_fp16, tolerance_num_fp16);
 }
 TEST_SUITE_END() // W1x1
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture5x1<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data5x1, framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                   data_layouts_dataset),
+                                                                                                           add_bias_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp16, tolerance_num_fp16);
+}
+TEST_SUITE_END() // W5x1
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
@@ -273,6 +336,21 @@ using NEDeconvolutionLayerQuantizedFixture3x3 = DeconvolutionValidationQuantized
 template <typename T>
 using NEDeconvolutionLayerQuantizedFixture1x1 = DeconvolutionValidationQuantizedFixture<Tensor, Accessor, NEDeconvolutionLayer, T, 1, 1>;
 
+template <typename T>
+using NEDeconvolutionLayerQuantizedFixture5x1 = DeconvolutionValidationQuantizedFixture<Tensor, Accessor, NEDeconvolutionLayer, T, 5, 1>;
+
+template <typename T>
+using NEDeconvolutionLayerQuantizedPerChannelFixture4x4 = DeconvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEDeconvolutionLayer, T, int8_t, 4, 4>;
+
+template <typename T>
+using NEDeconvolutionLayerQuantizedPerChannelFixture3x3 = DeconvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEDeconvolutionLayer, T, int8_t, 3, 3>;
+
+template <typename T>
+using NEDeconvolutionLayerQuantizedPerChannelFixture1x1 = DeconvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEDeconvolutionLayer, T, int8_t, 1, 1>;
+
+template <typename T>
+using NEDeconvolutionLayerQuantizedPerChannelFixture5x1 = DeconvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEDeconvolutionLayer, T, int8_t, 5, 1>;
+
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 
@@ -285,7 +363,7 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture4x4<uint8_t>, fr
                                                                                                                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W4x4
 
@@ -299,7 +377,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDeconvolutionLayerQuantizedFixture3x3<uint8_t
                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data3x3,
                        framework::dataset::make("DataType",
@@ -310,7 +388,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerQuantizedFixture3x3<uint8_t
                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W3x3
 
@@ -323,10 +401,23 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture1x1<uint8_t>, fr
                                                                                                                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W1x1
 
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture5x1<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data5x1, framework::dataset::make("DataType",
+                                                                                                                       DataType::QASYMM8)),
+                                                                                                                       data_layouts_dataset),
+                                                                                                                       input_qinfo_dataset),
+                                                                                                                       output_qinfo_dataset),
+                                                                                                                       add_bias_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W5x1
+
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
@@ -340,7 +431,7 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture4x4<int8_t>, fra
                                                                                                                       add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W4x4
 
@@ -354,7 +445,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDeconvolutionLayerQuantizedFixture3x3<int8_t>
                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerQuantizedFixture3x3<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data3x3,
                        framework::dataset::make("DataType",
@@ -365,28 +456,164 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerQuantizedFixture3x3<int8_t>
                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W3x3
 
 TEST_SUITE(W1x1)
-FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture1x1<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data1x1, framework::dataset::make("DataType",
-                                                                                                                      DataType::QASYMM8_SIGNED)),
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture1x1<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data1x1,
+                                                                                                                      framework::dataset::make("DataType",
+                                                                                                                              DataType::QASYMM8_SIGNED)),
                                                                                                                       data_layouts_dataset),
                                                                                                                       input_qinfo_dataset),
                                                                                                                       output_qinfo_dataset),
                                                                                                                       add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W1x1
 
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture5x1<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data5x1, framework::dataset::make("DataType",
+                                                                                                                      DataType::QASYMM8_SIGNED)),
+                                                                                                                      data_layouts_dataset),
+                                                                                                                      input_qinfo_dataset),
+                                                                                                                      output_qinfo_dataset),
+                                                                                                                      add_bias_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W5x1
+
 TEST_SUITE_END() // QASYMM8_SIGNED
+
+const auto input_qinfo_per_channel_dataset = framework::dataset::make("InputQuantizationInfo",
+{
+    QuantizationInfo(1.f / 255.f, 10)
+});
+const auto output_qinfo_per_channel_dataset = framework::dataset::make("OutputQuantizationInfo",
+{
+    QuantizationInfo(3.f / 255.f, 0)
+});
+const auto input_signed_qinfo_per_channel_dataset = framework::dataset::make("InputQuantizationInfo",
+{
+    QuantizationInfo(1.f / 255.f, -10)
+});
+const auto output_signed_qinfo_per_channel_dataset = framework::dataset::make("OutputQuantizationInfo",
+{
+    QuantizationInfo(3.f / 255.f, 10)
+});
+
+TEST_SUITE(QSYMM8_PER_CHANNEL)
+
+TEST_SUITE(W4x4)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedPerChannelFixture4x4<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data4x4,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_per_channel_dataset),
+                       output_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunSigned, NEDeconvolutionLayerQuantizedPerChannelFixture4x4<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data4x4,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_per_channel_dataset),
+                       output_signed_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W4x4
+
+TEST_SUITE(W3x3)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedPerChannelFixture3x3<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data3x3,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_per_channel_dataset),
+                       output_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunSigned, NEDeconvolutionLayerQuantizedPerChannelFixture3x3<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data3x3,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_per_channel_dataset),
+                       output_signed_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W3x3
+
+TEST_SUITE(W1x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedPerChannelFixture1x1<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data1x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_per_channel_dataset),
+                       output_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunSigned, NEDeconvolutionLayerQuantizedPerChannelFixture1x1<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data1x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_per_channel_dataset),
+                       output_signed_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W1x1
+
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedPerChannelFixture5x1<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data5x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_per_channel_dataset),
+                       output_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunSigned, NEDeconvolutionLayerQuantizedPerChannelFixture5x1<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data5x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_per_channel_dataset),
+                       output_signed_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W5x1
+
+TEST_SUITE_END() // QSYMM8_PER_CHANNEL
+
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // DeconvolutionLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/DepthConvertLayer.cpp b/tests/validation/NEON/DepthConvertLayer.cpp
index 8b581a2ce7..4972708144 100644
--- a/tests/validation/NEON/DepthConvertLayer.cpp
+++ b/tests/validation/NEON/DepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,25 +56,21 @@ const auto DepthConvertLayerU16toU8Dataset      = combine(framework::dataset::ma
 const auto DepthConvertLayerU16toU32Dataset     = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U32));
 const auto DepthConvertLayerS16toU8Dataset      = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U8));
 const auto DepthConvertLayerS16toS32Dataset     = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::S32));
-const auto DepthConvertLayerBF16toF32Dataset    = combine(framework::dataset::make("DataType", DataType::BFLOAT16), framework::dataset::make("DataType", DataType::F32));
 const auto DepthConvertLayerF16toU8Dataset      = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::U8));
 const auto DepthConvertLayerF16toF32Dataset     = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F32));
 const auto DepthConvertLayerF16toS32Dataset     = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::S32));
 const auto DepthConvertLayerF32toF16Dataset     = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F16));
 const auto DepthConvertLayerF32toS32Dataset     = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S32));
 const auto DepthConvertLayerF32toU8Dataset      = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U8));
-const auto DepthConvertLayerF32toBF16Dataset    = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::BFLOAT16));
 
 const auto DepthConvertLayerS32toF32Dataset     = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F32));
 const auto DepthConvertLayerS32toQASYMM8Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::QASYMM8));
 const auto DepthConvertLayerS32toF16Dataset     = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F16));
 const auto DepthConvertLayerS32toU8Dataset      = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::U8));
 
-const auto DepthConvertLayerF16toQASYMM8Dataset   = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::QASYMM8));
-const auto DepthConvertLayerF32toQASYMM8Dataset   = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8));
-const auto DepthConvertLayerShiftDatasetNightly   = framework::dataset::make("Shift", 0, 7);
-const auto DepthConvertLayerShiftDatasetPrecommit = framework::dataset::make("Shift", { 0, 3, 6 });
-const auto DepthConvertLayerZeroShiftDataset      = framework::dataset::make("Shift", 0);
+const auto DepthConvertLayerF16toQASYMM8Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::QASYMM8));
+const auto DepthConvertLayerF32toQASYMM8Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8));
+const auto DepthConvertLayerZeroShiftDataset    = framework::dataset::make("Shift", 0);
 
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
 constexpr AbsoluteTolerance<int32_t> tolerance_one_int32(1);
@@ -108,7 +104,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                                                    ConvertPolicy::WRAP,
                                                    ConvertPolicy::WRAP,
                                                      })),
-               framework::dataset::make("Shift",{ 1, 1, 1, 1, 1, 1, 8, 1,
+               framework::dataset::make("Shift",{ 0, 0, 0, 1, 1, 1, 8, 1,
                                                      })),
                framework::dataset::make("Expected", { false, false, false, false, true})),
                input_info, output_info, policy, shift, expected)
@@ -129,8 +125,6 @@ using NEDepthConvertLayerToU8Fixture = DepthConvertLayerValidationFixture<Tensor
 template <typename T>
 using NEDepthConvertLayerToU32Fixture = DepthConvertLayerValidationFixture<Tensor, Accessor, NEDepthConvertLayer, T, uint32_t>;
 template <typename T>
-using NEDepthConvertLayerToBF16Fixture = DepthConvertLayerValidationFixture<Tensor, Accessor, NEDepthConvertLayer, T, bfloat16>;
-template <typename T>
 using NEDepthConvertLayerToF16Fixture = DepthConvertLayerValidationFixture<Tensor, Accessor, NEDepthConvertLayer, T, half>;
 template <typename T>
 using NEDepthConvertLayerToF32Fixture = DepthConvertLayerValidationFixture<Tensor, Accessor, NEDepthConvertLayer, T, float>;
@@ -188,7 +182,7 @@ TEST_SUITE_END() // QASYMM8_to_S32
 TEST_SUITE(U8_to_U16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToU16Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toU16Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -196,7 +190,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToU16Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToU16Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toU16Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -206,7 +200,7 @@ TEST_SUITE_END() // U8_to_U16
 TEST_SUITE(U8_to_S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToS16Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toS16Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -214,7 +208,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToS16Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToS16Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toS16Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -223,7 +217,7 @@ TEST_SUITE_END() // U8_to_S16
 TEST_SUITE(U8_to_S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToS32Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toS32Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -231,7 +225,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToS32Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToS32Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toS32Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -241,7 +235,7 @@ TEST_SUITE_END() // U8_to_S32
 TEST_SUITE(U8_to_F32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToF32Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toF32Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -249,7 +243,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToF32Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToF32Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toF32Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -260,7 +254,7 @@ TEST_SUITE_END() // U8_to_F32
 TEST_SUITE(U8_to_F16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToF16Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toF16Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -268,7 +262,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToF16Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToF16Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toF16Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -279,14 +273,14 @@ TEST_SUITE_END() // U8_to_F36
 TEST_SUITE(U16_to_U8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToU8Fixture<uint16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU16toU8Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToU8Fixture<uint16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU16toU8Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -296,14 +290,14 @@ TEST_SUITE_END() // U16_to_U8
 TEST_SUITE(U16_to_U32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToU32Fixture<uint16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU16toU32Dataset),
                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                       DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                       DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToU32Fixture<uint16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU16toU32Dataset),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                     DepthConvertLayerShiftDatasetNightly))
+                                                                                                                     DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -313,14 +307,14 @@ TEST_SUITE_END() // U16_to_U32
 TEST_SUITE(S16_to_U8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToU8Fixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerS16toU8Dataset),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                     DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                     DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToU8Fixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerS16toU8Dataset),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                   DepthConvertLayerShiftDatasetNightly))
+                                                                                                                   DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -330,42 +324,20 @@ TEST_SUITE_END() // S16_to_U8
 TEST_SUITE(S16_to_S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToS32Fixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerS16toS32Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToS32Fixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerS16toS32Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // S16_to_S32
 
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
-TEST_SUITE(BFLOAT16_to_F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToF32Fixture<bfloat16>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerBF16toF32Dataset),
-                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                       DepthConvertLayerZeroShiftDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // BFLOAT16_to_F32
-
-TEST_SUITE(F32_to_BFLOAT16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToBF16Fixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerF32toBF16Dataset),
-                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                     DepthConvertLayerZeroShiftDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // F32_to_BFLOAT16
-#endif           /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
-
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16_to_QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToQASYMM8Fixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
@@ -608,7 +580,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToU8Fixture<int32_t>, framew
 TEST_SUITE_END() // S32_to_U8
 
 TEST_SUITE_END() // DepthConvertLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/DepthToSpaceLayer.cpp b/tests/validation/NEON/DepthToSpaceLayer.cpp
index d93437e36c..46ac59e95c 100644
--- a/tests/validation/NEON/DepthToSpaceLayer.cpp
+++ b/tests/validation/NEON/DepthToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -109,7 +109,7 @@ TEST_SUITE_END()
 TEST_SUITE_END()
 
 TEST_SUITE_END() // DepthToSpace
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index e255fc7b4d..e9609b7b72 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,32 +42,70 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
 using namespace arm_compute::misc::shape_calculator;
 
 namespace
 {
-constexpr RelativeTolerance<float>   tolerance_f32(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
+constexpr RelativeTolerance<float>   tolerance_f32(0.01f);        /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);        /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1); /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8_SIGNED */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.02)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 constexpr float                     tolerance_num = 0.05f;                 /**< Tolerance number */
 #endif                                                                     // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
-const auto depth_multipliers       = framework::dataset::make("DepthMultiplier", { 1, 2, 5 });
-const auto large_depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 5, 8 });
+const auto depth_multipliers       = make("DepthMultiplier", { 1, 2, 8 });
+const auto large_depth_multipliers = make("DepthMultiplier", { 5, 32 });
 
-//Activation Functions
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+// Activation Functions
+const auto NoActivation = make("ActivationInfo", ActivationLayerInfo());
+
+const auto ActivationFunctionsDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
 });
 
-const auto input_qinfo_dataset = framework::dataset::make("InputQInfo",
+const auto ActivationFunctionsDatasetNightly = make("ActivationInfo",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f, -0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SQUARE),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SWISH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::HARD_SWISH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 2.f, 1.f),
+#ifdef __aarch64__
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::GELU),
+#endif // __aarch64__
+});
+
+const auto ActivationFunctionsQuantizedSmallDataset = make("ActivationInfo",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+});
+
+const auto ActivationFunctionsQuantizedDataset = make("ActivationInfo",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f, -0.5f),
+});
+
+// This is only used when there is fused activation
+const auto input_qinfo_dataset = make("InputQInfo",
 {
     QuantizationInfo(0.3f, 10),
     QuantizationInfo(2.2f, 10),
 });
+
+const auto IgnoredQuantizationInfo = make("IgnoredQuantizationInfo", QuantizationInfo());
+
 } // namespace
 
 TEST_SUITE(NEON)
@@ -76,7 +114,7 @@ TEST_SUITE(DepthwiseConvolutionLayer)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Mismatching data type input/weights
+               make("InputInfo", { TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Mismatching data type input/weights
                                                        TensorInfo(TensorShape(32U, 18U, 3U), 1, DataType::F32),     // Mismatching input feature maps
                                                        TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Unsupported weights dimensions
                                                        TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Mismatching depth multiplier
@@ -88,7 +126,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // dilation < 1
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                                      }),
-               framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16),
+               make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16),
                                                          TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(5U, 5U, 2U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
@@ -100,7 +138,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                          TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
                                                        })),
-               framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
+               make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
@@ -112,7 +150,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                       })),
-               framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
+               make("OutputInfo", { TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
@@ -124,7 +162,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                       })),
-               framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+               make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
                                                       PadStrideInfo(1, 1, 0, 0),
                                                       PadStrideInfo(1, 1, 0, 0),
                                                       PadStrideInfo(1, 1, 0, 0),
@@ -136,7 +174,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                       PadStrideInfo(1, 1, 0, 0),
                                                       PadStrideInfo(1, 1, 0, 0),
                                                      })),
-               framework::dataset::make("DepthMultiplier", { 1,
+               make("DepthMultiplier", { 1,
                                                              1,
                                                              1,
                                                              3,
@@ -148,7 +186,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                              1,
                                                              1,
                                                             })),
-               framework::dataset::make("Dilation", { Size2D(1U, 1U),
+               make("Dilation", { Size2D(1U, 1U),
                                                       Size2D(1U, 1U),
                                                       Size2D(1U, 1U),
                                                       Size2D(1U, 1U),
@@ -160,7 +198,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                       Size2D(0U, 1U),
                                                       Size2D(1U, 1U),
                                                             })),
-               framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false, false, true })),
+               make("Expected", { false, false, false, false, false, false, false, false, false, false, true })),
                input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier,dilation, expected)
 {
     bool is_valid = bool(NEDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false),
@@ -169,7 +207,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
 }
 
 DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
-                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Mismatching data type input/weights
+                make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Mismatching data type input/weights
                                                         TensorInfo(TensorShape(27U, 13U, 3U), 1, DataType::F32),     // Mismatching input feature maps
                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Mismatching depth multiplier
                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Invalid biases size
@@ -178,7 +216,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                         TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),     // Patch size bigger than input width
                                                         TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),     // Dilation < 1
                                                       }),
-                framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16),
+                make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16),
                                                           TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
                                                           TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
                                                           TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
@@ -187,7 +225,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                           TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
                                                           TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
                                                         })),
-                framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
+                make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(4U), 1, DataType::F32),
@@ -196,7 +234,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                          TensorInfo(TensorShape(16U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(16U), 1, DataType::F32),
                                                        })),
-                framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
@@ -205,7 +243,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                          TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
                                                        })),
-                framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+                make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
@@ -214,7 +252,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
                                                       })),
-                framework::dataset::make("DepthMultiplier", { 1,
+                make("DepthMultiplier", { 1,
                                                               1,
                                                               3,
                                                               1,
@@ -223,7 +261,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                               2,
                                                               2,
                                                              })),
-                framework::dataset::make("Dilation", { Size2D(1U, 1U),
+                make("Dilation", { Size2D(1U, 1U),
                                                        Size2D(1U, 1U),
                                                        Size2D(1U, 1U),
                                                        Size2D(1U, 1U),
@@ -232,7 +270,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                        Size2D(25U, 1U),
                                                        Size2D(0U, 1U),
                                                              })),
-                framework::dataset::make("Expected", { false, false, false, false, false, false, false, false})),
+                make("Expected", { false, false, false, false, false, false, false, false})),
                 input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier,dilation, expected)
 {
     bool is_valid = bool(NEDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier, ActivationLayerInfo(), dilation));
@@ -242,45 +280,71 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
 // *INDENT-ON*
 template <typename T>
 using NEDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
+template <typename T>
+using NEDepthwiseConvolutionLayerMixedDataLayoutFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T, true>;
+template <typename T>
+using NEDepthwiseConvolutionLayerVariableWeightsFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T, false, false, true>;
 
 TEST_SUITE(Float)
 TEST_SUITE(F32)
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 5 }),
+        make("DataType", DataType::F32),
+        make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }),
+        ActivationFunctionsDatasetNightly))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
 TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                                                                                                                       depth_multipliers),
-                                                                                                                       framework::dataset::make("DataType",
-                                                                                                                               DataType::F32)),
-                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                           depth_multipliers),
+                           make("DataType", DataType::F32)),
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                           ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
-                                                                                                                     large_depth_multipliers),
-                                                                                                                     framework::dataset::make("DataType",
-                                                                                                                             DataType::F32)),
-                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                     ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, NEDepthwiseConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                           make("DepthMultiplier", { 2 })),
+                           make("DataType", DataType::F32)),
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                           make("ActivationInfo", ActivationLayerInfo())))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                           large_depth_multipliers),
+                           make("DataType", DataType::F32)),
+                           make("DataLayout", { DataLayout::NHWC })),
+                           make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                                                                                                                       depth_multipliers),
-                                                                                                                       framework::dataset::make("DataType",
-                                                                                                                               DataType::F32)),
-                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+                                                           depth_multipliers),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
-                                                                                                                     large_depth_multipliers),
-                                                                                                                     framework::dataset::make("DataType",
-                                                                                                                             DataType::F32)),
-                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                     ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+                                                           large_depth_multipliers),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
@@ -288,43 +352,43 @@ TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
 
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                                                                                                                       depth_multipliers),
-                                                                                                                       framework::dataset::make("DataType",
-                                                                                                                               DataType::F32)),
-                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                           depth_multipliers),
+                           make("DataType",
+                                                    DataType::F32)),
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                           ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                                                       large_depth_multipliers),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F32)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                                           large_depth_multipliers),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                       depth_multipliers),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F32)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                                           depth_multipliers),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                       large_depth_multipliers),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F32)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                                           large_depth_multipliers),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
@@ -333,33 +397,72 @@ TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // W3x3
 
 TEST_SUITE(Optimized)
-FIXTURE_DATA_TEST_CASE(RunSmall3x3, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F32)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunVariableWeightsSmall3x3, NEDepthwiseConvolutionLayerVariableWeightsFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall5x5, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
-                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F32)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout3x3, NEDepthwiseConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType", DataType::F32)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   make("ActivationInfo", ActivationLayerInfo())))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge3x3, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F32)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunVariableWeightsSmall5x5, NEDepthwiseConvolutionLayerVariableWeightsFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunVariableWeightsLarge3x3, NEDepthwiseConvolutionLayerVariableWeightsFixture<float>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
@@ -368,42 +471,57 @@ TEST_SUITE_END() // F32
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 5 }),
+        make("DataType", DataType::F16),
+        make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }),
+        ActivationFunctionsDatasetNightly))
+{
+    validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+
 TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                                                                                                                      depth_multipliers),
-                                                                                                                      framework::dataset::make("DataType",
-                                                                                                                              DataType::F16)),
-                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                      ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                           depth_multipliers),
+                           make("DataType",
+                                                    DataType::F16)),
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                           ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
-                                                                                                                    large_depth_multipliers),
-                                                                                                                    framework::dataset::make("DataType",
-                                                                                                                            DataType::F16)),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                    ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                                                                                                                        large_depth_multipliers),
+                                                                                                                        make("DataType",
+                                                                                                                                DataType::F16)),
+                                                                                                                        make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                        make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
 }
 
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                                                                                                                      depth_multipliers),
-                                                                                                                      framework::dataset::make("DataType",
-                                                                                                                              DataType::F16)),
-                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                      ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+                                                           depth_multipliers),
+                                                   make("DataType", DataType::F16)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
-                                                                                                                    large_depth_multipliers),
-                                                                                                                    framework::dataset::make("DataType",
-                                                                                                                            DataType::F16)),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                    ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+                                                           large_depth_multipliers),
+                                                   make("DataType", DataType::F16)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
 }
@@ -413,45 +531,45 @@ TEST_SUITE_END() // Generic
 template <typename T>
 using NEDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                                                                                                                      depth_multipliers),
-                                                                                                                      framework::dataset::make("DataType",
-                                                                                                                              DataType::F16)),
-                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                      ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                           depth_multipliers),
+                           make("DataType",
+                                                    DataType::F16)),
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                           ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                                                       large_depth_multipliers),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F16)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                                           large_depth_multipliers),
+                                                   make("DataType",
+                                                                            DataType::F16)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
 
 TEST_SUITE(Dilation)
 
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                       depth_multipliers),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F16)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                                           depth_multipliers),
+                                                   make("DataType",
+                                                                            DataType::F16)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                       large_depth_multipliers),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F16)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                                           large_depth_multipliers),
+                                                   make("DataType",
+                                                                            DataType::F16)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
@@ -460,33 +578,33 @@ TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // W3x3
 
 TEST_SUITE(Optimized)
-FIXTURE_DATA_TEST_CASE(RunSmallW3x3, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F16)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallW3x3, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F16)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunSmallW5x5, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
-                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F16)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallW5x5, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F16)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLargeW3x3, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F16)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLargeW3x3, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F16)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
@@ -497,94 +615,162 @@ TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
 template <typename T>
-using NEDepthwiseConvolutionLayerQuantizedFixtureOptimized = DepthwiseConvolutionLayerValidationQuantizedFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
-template <typename T>
 using NEDepthwiseConvolutionLayerQuantizedFixture                    = DepthwiseConvolutionLayerValidationQuantizedFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
+template <typename T>
+using NEDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture     = DepthwiseConvolutionLayerValidationQuantizedFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T, true>;
 using NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture = DepthwiseConvolutionLayerValidationQuantizedPerChannelFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, uint8_t, int8_t>;
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                                                                       depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 5 }),
+        make("DataType", DataType::QASYMM8),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(0.05f, 4) }),
+        make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }),
+        ActivationFunctionsQuantizedDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 
+TEST_SUITE(Generic)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) }),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, NEDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        make("DepthMultiplier", { 2 }),
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                                                                       depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
-                                                                       large_depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) }),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                                                                       large_depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 
 TEST_SUITE(Dilation)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) }),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                                       large_depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
@@ -592,39 +778,69 @@ TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // W3x3
 
 TEST_SUITE(Optimized)
-FIXTURE_DATA_TEST_CASE(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                                               framework::dataset::make("DataType",
-                                                                                        DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3WithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall5x5, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
-                                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                                               framework::dataset::make("DataType",
-                                                                                        DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout3x3, NEDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                                               framework::dataset::make("DataType",
-                                                                                        DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5WithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
@@ -632,170 +848,269 @@ TEST_SUITE_END() // Optimized
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
-TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                                                                       depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 5 }),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(0.05f, 4) }),
+        make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }),
+        ActivationFunctionsQuantizedDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 
+TEST_SUITE(Generic)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                                                                       depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
-                                                                       large_depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
 TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
 
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                                                                       large_depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
 
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                                       large_depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
 TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // W3x3
 
 TEST_SUITE(Optimized)
-FIXTURE_DATA_TEST_CASE(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                                               framework::dataset::make("DataType",
-                                                                                        DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall5x5, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
-                                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                                               framework::dataset::make("DataType",
-                                                                                        DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3WithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                                               framework::dataset::make("DataType",
-                                                                                        DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5WithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
 TEST_SUITE_END() // Optimized
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 TEST_SUITE(QSYMM8_PER_CHANNEL)
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 5 }),
+        make("InputDataType", DataType::QASYMM8),
+        make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(0.05f, 4) }),
+        make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }),
+        ActivationFunctionsQuantizedDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
 TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                                                                               depth_multipliers),
-                                                                       framework::dataset::make("InputDataType", DataType::QASYMM8)),
-                                                               framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                                                                                   depth_multipliers),
+                                                                           make("InputDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           input_qinfo_dataset),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                                                                               depth_multipliers),
-                                                                       framework::dataset::make("InputDataType", DataType::QASYMM8)),
-                                                               framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+                                                                                   depth_multipliers),
+                                                                           make("InputDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           input_qinfo_dataset),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
-                                                                               depth_multipliers),
-                                                                       framework::dataset::make("InputDataType", DataType::QASYMM8)),
-                                                               framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+                                                                                   depth_multipliers),
+                                                                           make("InputDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           input_qinfo_dataset),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
@@ -803,27 +1118,27 @@ TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
 
 TEST_SUITE(Optimized)
-FIXTURE_DATA_TEST_CASE(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                               framework::dataset::make("DepthMultiplier", 1)),
-                                                                       framework::dataset::make("InputDataType", DataType::QASYMM8)),
-                                                               framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                                                   make("DepthMultiplier", 1)),
+                                                                           make("InputDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           input_qinfo_dataset),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                               framework::dataset::make("DepthMultiplier", 1)),
-                                                                       framework::dataset::make("InputDataType", DataType::QASYMM8)),
-                                                               framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                                                   make("DepthMultiplier", 1)),
+                                                                           make("InputDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           input_qinfo_dataset),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
@@ -832,7 +1147,7 @@ TEST_SUITE_END() // QSYMM8_PER_CHANNEL
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // DepthwiseConvLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
index d379ce728e..221fc5d249 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/NEON/Helper.h"
 #include "tests/framework/Macros.h"
@@ -37,12 +38,12 @@ namespace validation
 {
 using namespace arm_compute::misc::shape_calculator;
 
-// Create function for NEDepthwiseConvolutionLayerKernel
-using NEDepthwiseConvolutionLayerNative = NESynthetizeFunctionWithZeroConstantKernelBorder<NEDepthwiseConvolutionLayerNativeKernel>;
+// Create function for CpuDepthwiseConvolutionKernel
+using CpuDepthwiseConvolutionNative = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuDepthwiseConv2dNativeKernel>;
 
 // Fixture for NEDepthwiseConvolutionLayerKernel
 template <typename T>
-using NEDepthwiseConvolutionLayerNativeFixture = DepthwiseConvolutionLayerNativeValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayerNative, T>;
+using CpuDepthwiseConvolutionNativeFixture = DepthwiseConvolutionLayerNativeValidationFixture<Tensor, Accessor, CpuDepthwiseConvolutionNative, T>;
 
 namespace
 {
@@ -124,8 +125,9 @@ TEST_CASE(ValidateNoPadding, framework::DatasetMode::ALL)
     auto biases  = create_tensor<Tensor>(bias_shape, data_type, 1, QuantizationInfo(), data_layout);
     auto dst     = create_tensor<Tensor>(TensorShape(), data_type, 1, QuantizationInfo(), data_layout);
 
-    NEDepthwiseConvolutionLayerNativeKernel dwc;
-    dwc.configure(&src, &weights, &biases, &dst, pad_stride_info);
+    cpu::kernels::CpuDepthwiseConv2dNativeKernel dwc;
+    const ConvolutionInfo info{pad_stride_info, 1, ActivationLayerInfo(), Size2D(1, 1)};
+    dwc.configure(src.info(), weights.info(), biases.info(), dst.info(), info);
 
     ARM_COMPUTE_EXPECT(src.info()->padding().empty(), framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(weights.info()->padding().empty(), framework::LogLevel::ERRORS);
@@ -133,9 +135,47 @@ TEST_CASE(ValidateNoPadding, framework::DatasetMode::ALL)
     ARM_COMPUTE_EXPECT(dst.info()->padding().empty(), framework::LogLevel::ERRORS);
 }
 
+TEST_SUITE(KERNEL_SELECTION)
+DATA_TEST_CASE(KernelSelection_mul_and_add, framework::DatasetMode::ALL,
+               combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType", { DataType::F32,
+                                                              DataType::F16,
+                                                              DataType::QASYMM8_SIGNED,
+                                                              DataType::QASYMM8,
+                                                              DataType::QSYMM8_PER_CHANNEL
+                                                            })),
+                       framework::dataset::make("DataType_per_channel", { DataType::QASYMM8,
+                                                                          DataType::QASYMM8_SIGNED
+                                                            })),
+                cpu_ext, data_type, data_type_per_channel)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuDepthwiseConv2dNativeKernel::get_implementation(
+        DepthwiseConv2dNativeDataTypeISASelectorData{ data_type, data_type_per_channel,cpu_isa },
+        cpu::KernelSelectionType::Preferred );
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string per_channel_str = "_";
+    if (data_type == DataType::QSYMM8_PER_CHANNEL)
+    {
+        per_channel_str = "_" + cpu_impl_dt(data_type_per_channel) + "_" ;
+    }
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type)  + per_channel_str + "deptwiseconv2dnative";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // KERNEL_SELECTION
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CpuDepthwiseConvolutionNativeFixture<float>, framework::DatasetMode::ALL,
                 combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(width_values_precommit,
                                                                                                 height_values_precommit),
                                                                                                 channel_values_precommit),
@@ -152,7 +192,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerNativeFixture<float>
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::NIGHTLY,
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CpuDepthwiseConvolutionNativeFixture<float>, framework::DatasetMode::NIGHTLY,
                 combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(width_values_nightly,
                                                                                                 height_values_nightly),
                                                                                                 channel_values_nightly),
@@ -172,7 +212,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerNativeFixture<float>
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // DepthwiseConvolutionLayerNative
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
-\ No newline at end of file
+} // namespace arm_compute
diff --git a/tests/validation/NEON/DequantizationLayer.cpp b/tests/validation/NEON/DequantizationLayer.cpp
index bce60c7891..82a2149152 100644
--- a/tests/validation/NEON/DequantizationLayer.cpp
+++ b/tests/validation/NEON/DequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -149,7 +149,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDequantizationLayerFixture<float>, framework:
 TEST_SUITE_END() // FP32
 
 TEST_SUITE_END() // DequantizationLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Derivative.cpp b/tests/validation/NEON/Derivative.cpp
deleted file mode 100644
index aae41e4f70..0000000000
--- a/tests/validation/NEON/Derivative.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEDerivative.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/GradientDimensionDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/DerivativeFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(Derivative)
-
-using NEDerivativeFixture = DerivativeValidationFixture<Tensor, Accessor, NEDerivative, uint8_t, int16_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDerivativeFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                       Format::U8)),
-                                                                                               datasets::GradientDimensions()))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDerivativeFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                       Format::U8)),
-                                                                                               datasets::GradientDimensions()))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-TEST_SUITE_END() // Derivative
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/DetectionPostProcessLayer.cpp b/tests/validation/NEON/DetectionPostProcessLayer.cpp
index 80b1856e5f..7d725327b7 100644
--- a/tests/validation/NEON/DetectionPostProcessLayer.cpp
+++ b/tests/validation/NEON/DetectionPostProcessLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -150,7 +150,7 @@ inline void base_test_case(DetectionPostProcessLayerInfo info, DataType data_typ
         quantize_and_fill_tensor(Accessor(anchors), anchors_vector);
     }
 
-    // Determine the output through the NEON kernel
+    // Determine the output through the Compute Library operator
     Tensor                      output_boxes;
     Tensor                      output_classes;
     Tensor                      output_scores;
@@ -384,7 +384,7 @@ TEST_CASE(Quantized_regular, framework::DatasetMode::ALL)
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE_END() // DetectionPostProcessLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Dilate.cpp b/tests/validation/NEON/Dilate.cpp
deleted file mode 100644
index 3c096f7ce6..0000000000
--- a/tests/validation/NEON/Dilate.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEDilate.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/DilateFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /* Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /* Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Dilate)
-
-template <typename T>
-using NEDilateFixture = DilateValidationFixture<Tensor, Accessor, NEDilate, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDilateFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::U8)),
-                                                                                                    datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDilateFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::U8)),
-                                                                                                    datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END() // Dilate
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/DilatedConvolutionLayer.cpp b/tests/validation/NEON/DilatedConvolutionLayer.cpp
index cf2f5f2ea4..fbfe8b8a7a 100644
--- a/tests/validation/NEON/DilatedConvolutionLayer.cpp
+++ b/tests/validation/NEON/DilatedConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/cpu/operators/CpuConv2d.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/DilatedConvolutionLayerDataset.h"
@@ -49,7 +50,7 @@ const AbsoluteTolerance<float>            abs_tolerance_f16(0.3f);
 const RelativeTolerance<half_float::half> rel_tolerance_f16(half_float::half(0.2f)); /**< Relative tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 constexpr float                           tolerance_num_f16 = 0.07f;                 /**< Tolerance number for FP16 */
 #endif                                                                               /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0);                           /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+constexpr AbsoluteTolerance<int32_t> tolerance_qasymm8(1);                           /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
 
 /** CNN data types */
 const auto CNNDataTypes = framework::dataset::make("DataType",
@@ -96,7 +97,7 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
                                           framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
                input_info, weights_info, output_info, conv_info, dilation, expected)
 {
-    ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false),
+    ConvolutionMethod is_valid = cpu::CpuConv2d::get_convolution_method(&input_info.clone()->set_is_resizable(false),
                                                                             &weights_info.clone()->set_is_resizable(false),
                                                                             &output_info.clone()->set_is_resizable(false),
                                                                             conv_info, WeightsInfo(), dilation);
@@ -161,13 +162,18 @@ template <typename T>
 using NEGEMMDilatedConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T>;
 
 TEST_SUITE(Quantized)
+/// @note: Every asymmetric quantized test where there's no fused activation will have its quantization info ignored
+/// This is because instead of using the same quantization information for all the tensors, the fixture generates
+/// separate quantization info for each input and the output tensor.
+/// When we can also support dynamic quantization with the presence of activation, we can remove the explicit
+/// quantization info.
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMDilatedConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(),
                                                                framework::dataset::make("ReshapeWeights", { true })),
                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
                                                framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("IgnoredQuantizationInfo", { QuantizationInfo() })),
                                framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
 {
     // Validate output
@@ -178,7 +184,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMDilatedConvolutionLayerQuantizedFixture<u
                                                                framework::dataset::make("ReshapeWeights", { true })),
                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
                                                framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("IgnoredQuantizationInfo", { QuantizationInfo() })),
                                framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
 {
     // Validate output
@@ -188,7 +194,7 @@ TEST_SUITE_END() // QASYMM8
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // GEMMDilatedConvolutionLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp
index afd9e3952f..0779c9d388 100644
--- a/tests/validation/NEON/DirectConvolutionLayer.cpp
+++ b/tests/validation/NEON/DirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,9 +23,12 @@
  */
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuDirectConv2dKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ShapeDatasets.h"
@@ -70,8 +73,8 @@ const auto data_pad_f16 = concat(combine(framework::dataset::make("PadX", { 0, 1
                                                  framework::dataset::make("KernelSize", 1))));
 
 const auto data_f32 = combine(datasets::SmallDirectConvolutionShapes(),
-                              combine(framework::dataset::make("StrideX", { 1, 2, 3 }),
-                                      combine(framework::dataset::make("StrideY", { 1, 2, 3 }),
+                              combine(framework::dataset::make("StrideX", { 1, 2, 3, 4 }),
+                                      combine(framework::dataset::make("StrideY", { 1, 2, 3, 4 }),
                                               data_pad_f32)));
 
 const auto data_f16 = combine(datasets::SmallDirectConvolutionShapes(),
@@ -87,17 +90,25 @@ const auto data_prec = combine(datasets::SmallDirectConvolutionShapes(),
                                                                framework::dataset::make("KernelSize", 3))))));
 
 const auto data9x9 = combine(datasets::SmallDirectConvolutionShapes(),
-                             combine(framework::dataset::make("StrideX", { 1 }),
-                                     combine(framework::dataset::make("StrideY", { 1 }),
+                             combine(framework::dataset::make("StrideX", { 1, 2, 3 }),
+                                     combine(framework::dataset::make("StrideY", { 1, 2, 3 }),
                                              combine(framework::dataset::make("PadX", { 0, 2 }),
                                                      combine(framework::dataset::make("PadY", { 0, 3 }),
                                                              framework::dataset::make("KernelSize", 9))))));
 
-const auto data_f32_nightly = combine(data_f32, framework::dataset::make("NumKernels", { 1, 4 }));
-const auto data_f16_nightly = combine(data_f16, framework::dataset::make("NumKernels", { 1, 4 }));
+const auto data8x8 = combine(datasets::SmallDirectConvolutionShapes(),
+                             combine(framework::dataset::make("StrideX", { 1, 2, 3 }),
+                                     combine(framework::dataset::make("StrideY", { 1, 2, 3 }),
+                                             combine(framework::dataset::make("PadX", { 0 }),
+                                                     combine(framework::dataset::make("PadY", { 0 }),
+                                                             framework::dataset::make("KernelSize", 8))))));
+
+const auto data_f32_nightly = combine(data_f32, framework::dataset::make("NumKernels", { 1, 4, 5 }));
+const auto data_f16_nightly = combine(data_f16, framework::dataset::make("NumKernels", { 1, 4, 5 }));
 
 const auto data_precommit    = combine(data_prec, framework::dataset::make("NumKernels", { 1 }));
 const auto data_precommit9x9 = combine(data9x9, framework::dataset::make("NumKernels", { 4 }));
+const auto data_precommit8x8 = combine(data8x8, framework::dataset::make("NumKernels", { 4 }));
 
 /* The following tests is from real use-case that made DirectConvolution
  * overflows in terms of its tensor indexing. This test case is using
@@ -129,17 +140,95 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
 TEST_SUITE(NEON)
 TEST_SUITE(DirectConvolutionLayer)
 
+/** Check whether the configuration of a Direct Convolution layer with no
+ * bias leads to a successful execution.
+ */
+TEST_CASE(NoBias, framework::DatasetMode::PRECOMMIT)
+{
+    const auto     src_shape     = TensorShape(27U, 13U, 2U);
+    const auto     weights_shape = TensorShape(3U, 3U, 2U, 4U);
+    const auto     bias_shape    = TensorShape(4U);
+    const auto     dst_shape     = TensorShape(25U, 11U, 4U);
+    constexpr auto dt            = DataType::F32;
+
+    auto src     = create_tensor<Tensor>(src_shape, dt);
+    auto weights = create_tensor<Tensor>(weights_shape, dt);
+    auto dst     = create_tensor<Tensor>(dst_shape, dt);
+
+    const auto conv_info = PadStrideInfo(1, 1, 0, 0);
+
+    // Create Direct Convolution function
+    NEDirectConvolutionLayer conv{};
+    conv.configure(&src, &weights, nullptr, &dst, conv_info);
+
+    src.allocator()->allocate();
+    weights.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    library->fill_tensor_value(Accessor(src), 1.f);
+    library->fill_tensor_value(Accessor(weights), 1.f);
+
+    conv.run();
+
+    // Compute reference to compare
+    SimpleTensor<float> ref_src{ src_shape, dt };
+    SimpleTensor<float> ref_weights{ weights_shape, dt };
+    SimpleTensor<float> ref_bias{ bias_shape, dt };
+    library->fill_tensor_value(ref_src, 1.f);
+    library->fill_tensor_value(ref_weights, 1.f);
+    // No bias
+    library->fill_tensor_value(ref_bias, 0.f);
+    auto ref_dst = reference::convolution_layer<float>(ref_src, ref_weights, ref_bias, dst_shape, conv_info);
+
+    validate(Accessor(dst), ref_dst);
+}
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
+               concat(combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                      framework::dataset::make("DataType", { DataType::F32 })),
+                              framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                      combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                      framework::dataset::make("DataType", { DataType::F16 })),
+                              framework::dataset::make("DataLayout", { DataLayout::NCHW }))),
+               cpu_ext, data_type, data_layout)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuDirectConv2dKernel::get_implementation(DataTypeDataLayoutISASelectorData{ data_type, data_layout, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string data_layout_str;
+    if(data_layout == DataLayout::NCHW)
+    {
+        data_layout_str = "nchw";
+    }
+    else
+    {
+        data_layout_str = "nhwc";
+    }
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_" + data_layout_str + "_directconv2d";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
-        framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching data type input/weights
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching input feature maps
+        framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid: Mismatching data type input/weights
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid: Mismatching input feature maps
                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported kernel width
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Non-rectangular weights dimensions
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported non-rectangular weights dimensions
                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid weights dimensions
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid stride
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid biases size
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid biases dimensions
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported stride
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported biases size
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported biases dimensions
                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid output size
                                               }),
         framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F16),
@@ -185,7 +274,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
                                                        framework::dataset::make("ActivationInfo",
 {
     ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(),
+    ActivationLayerInfo(),
+    ActivationLayerInfo(),
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
 })),
         framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false })),
         input_info, weights_info, biases_info, output_info, conv_info, act_info, expected)
@@ -235,6 +331,8 @@ DATA_TEST_CASE(NoPaddingNHWCKernel, framework::DatasetMode::ALL, combine(combine
 
 template <typename T>
 using NEDirectConvolutionLayerFixture = DirectConvolutionValidationFixture<Tensor, Accessor, NEDirectConvolutionLayer, T>;
+template <typename T>
+using NEDirectConvolutionLayerMixedDataLayoutFixture = DirectConvolutionValidationFixture<Tensor, Accessor, NEDirectConvolutionLayer, T, true>;
 
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -266,6 +364,24 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolutionLayerFixture<float>, framewo
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEDirectConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit,
+                       framework::dataset::make("DataType", DataType::F32)),
+                       ActivationFunctionsDataset),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall8x8, NEDirectConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit8x8, framework::dataset::make("DataType",
+                                                                                                                       DataType::F32)),
+                                                                                                                       ActivationFunctionsDataset),
+                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
 FIXTURE_DATA_TEST_CASE(RunSmall9x9, NEDirectConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit9x9, framework::dataset::make("DataType",
                                                                                                                        DataType::F32)),
                                                                                                                        ActivationFunctionsDataset),
@@ -293,7 +409,7 @@ FIXTURE_DATA_TEST_CASE(RunLargeUsecase, NEDirectConvolutionLayerFixture<float>,
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // DirectConvolutionLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseAbsoluteValue.cpp b/tests/validation/NEON/ElementwiseAbsoluteValue.cpp
index f135ba9913..0667ac73f9 100644
--- a/tests/validation/NEON/ElementwiseAbsoluteValue.cpp
+++ b/tests/validation/NEON/ElementwiseAbsoluteValue.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,6 +46,13 @@ RelativeTolerance<float> tolerance_fp32(0.000001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<float> tolerance_fp16(0.01f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+#else  // #if !defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); // There is difference of 1, because quantizing in reference uses round policy "TO_NEAREST_UP", where the armv7a neon kernel uses "TO_ZERO"
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
+#endif // #if !defined(__aarch64__)
 } // namespace
 
 TEST_SUITE(NEON)
@@ -53,6 +60,9 @@ TEST_SUITE(AbsLayer)
 template <typename T>
 using NEAbsLayerFixture = AbsValidationFixture<Tensor, Accessor, NEAbsLayer, T>;
 
+template <typename T>
+using NEAbsLayerQuantizedFixture = AbsQuantizedValidationFixture<Tensor, Accessor, NEAbsLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -107,8 +117,34 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEAbsLayerFixture<int32_t>, framework::DatasetM
 TEST_SUITE_END() // S32
 TEST_SUITE_END() // Integer
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAbsLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.2, -3) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.5, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAbsLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.075, 6) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.1, -7) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // AbsLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseDivision.cpp b/tests/validation/NEON/ElementwiseDivision.cpp
index db34af1c05..95db4ad5fd 100644
--- a/tests/validation/NEON/ElementwiseDivision.cpp
+++ b/tests/validation/NEON/ElementwiseDivision.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,6 +43,8 @@ namespace validation
 namespace
 {
 RelativeTolerance<float> tolerance_fp32(0.000001f);
+AbsoluteTolerance<int>   tolerance_zero_s32(0); // Tolerance for S32 division
+
 /** Input data sets **/
 const auto ElementwiseDivisionS32Dataset = combine(combine(framework::dataset::make("DataType", DataType::S32),
                                                            framework::dataset::make("DataType", DataType::S32)),
@@ -54,6 +56,8 @@ const auto              ElementwiseDivisionFP16Dataset = combine(combine(framewo
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 const auto ElementwiseDivisionFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                     framework::dataset::make("DataType", DataType::F32));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -91,10 +95,41 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
+// Test test cases will execute the function with dynamic-stated shapes
+// Since other elementwise operations share the same kernel, this tests are added only here.
+// Also, only FP32 is tested since data type doesn't/shouldn't matter with dynamic shapes.
+TEST_SUITE(DynamicShape)
+template <typename T>
+using CpuElementwiseDivisionDynamicShapeFixture = ArithmeticDivisionDynamicShapeValidationFixture<Tensor, Accessor, NEElementwiseDivision, T>;
+
+template <typename T>
+using CpuElementwiseDivisionBroadcastDynamicShapeFixture = ArithmeticDivisionBroadcastDynamicShapeValidationFixture<Tensor, Accessor, NEElementwiseDivision, T>;
+
+TEST_SUITE(F32)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuElementwiseDivisionDynamicShapeFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseDivisionFP32Dataset),
+                                                                                                                        InPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CpuElementwiseDivisionBroadcastDynamicShapeFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+                       ElementwiseDivisionFP32Dataset),
+                       OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
+}
+
+TEST_SUITE_END() // F32
+TEST_SUITE_END() // DynamicShape
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseDivisionFP16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseDivisionFP16Dataset),
+                                                                                                          InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp16, 0.01);
@@ -103,7 +138,8 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseDivisionFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseDivisionFP32Dataset),
+                                                                                                           InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -112,8 +148,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<float>, framework:
 template <typename T>
 using NEElementwiseDivisionBroadcastFixture = ArithmeticDivisionBroadcastValidationFixture<Tensor, Accessor, NEElementwiseDivision, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseDivisionBroadcastFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapesBroadcast(),
-                       ElementwiseDivisionFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseDivisionBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+                       ElementwiseDivisionFP32Dataset),
+                       OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwiseDivisionBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcastInplace(),
+                       ElementwiseDivisionFP32Dataset),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -123,10 +167,11 @@ TEST_SUITE_END() // Float
 
 TEST_SUITE(Integer)
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<int32_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseDivisionS32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<int32_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseDivisionS32Dataset),
+                                                                                                             InPlaceDataSet))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
+    validate(Accessor(_target), _reference, tolerance_zero_s32);
 }
 TEST_SUITE_END() // S32
 TEST_SUITE_END() // Integer
diff --git a/tests/validation/NEON/ElementwiseExpLayer.cpp b/tests/validation/NEON/ElementwiseExpLayer.cpp
index 3168b9ffe2..31cd78626f 100644
--- a/tests/validation/NEON/ElementwiseExpLayer.cpp
+++ b/tests/validation/NEON/ElementwiseExpLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,6 +46,15 @@ RelativeTolerance<float> tolerance_fp32(0.000001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<float> tolerance_fp16(0.01f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#if defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+#else  // #if !defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); // There is difference of 1, because quantizing in reference uses round policy "TO_NEAREST_UP", where the armv7a neon kernel uses "TO_ZERO"
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
+#endif // #if !defined(__aarch64__)
+
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(ExpLayer)
@@ -53,6 +62,9 @@ TEST_SUITE(ExpLayer)
 template <typename T>
 using NEExpLayerFixture = ExpValidationFixture<Tensor, Accessor, NEExpLayer, T>;
 
+template <typename T>
+using NEExpLayerQuantizedFixture = ExpQuantizedValidationFixture<Tensor, Accessor, NEExpLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -82,8 +94,34 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEExpLayerFixture<float>, framework::DatasetMod
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEExpLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.01, 0) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.003, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEExpLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.02, -1) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.002, -2) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // ExpLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseKernelSelection.cpp b/tests/validation/NEON/ElementwiseKernelSelection.cpp
new file mode 100644
index 0000000000..7990a51936
--- /dev/null
+++ b/tests/validation/NEON/ElementwiseKernelSelection.cpp
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuElementwiseKernel.h"
+#include "src/cpu/kernels/CpuElementwiseUnaryKernel.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(NEON)
+TEST_SUITE(KernelSelection)
+
+DATA_TEST_CASE(KernelSelection_elementwise_unary, framework::DatasetMode::ALL, concat(
+                   combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                           framework::dataset::make("DataType", { DataType::F32,
+                                                                  DataType::F16,
+                                                                  DataType::S32
+                                                                })),
+                   combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                           framework::dataset::make("DataType", { DataType::F32,
+                                                                  DataType::F16,
+                                                                  DataType::S32
+                                                                }))),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_elementwise_unary";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
+DATA_TEST_CASE(KernelSelection_elementwise_arithmetic, framework::DatasetMode::ALL, concat(concat(
+                                                                                               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                                                                                       framework::dataset::make("DataType", { DataType::F32,
+                                                                                                               DataType::F16,
+                                                                                                               DataType::S32,
+                                                                                                               DataType::S16,
+                                                                                                               DataType::QASYMM8,
+                                                                                                               DataType::QASYMM8_SIGNED
+                                                                                                                                            })),
+                                                                                               combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                                                                                                       framework::dataset::make("DataType", { DataType::F32,
+                                                                                                               DataType::F16,
+                                                                                                               DataType::S32,
+                                                                                                               DataType::S16
+                                                                                                                                            }))),
+                                                                                           combine(framework::dataset::make("CpuExt", std::string("SVE2")),
+                                                                                                   framework::dataset::make("DataType", { DataType::QASYMM8,
+                                                                                                           DataType::QASYMM8_SIGNED
+                                                                                                                                        }))),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.sve2 = (cpu_ext == "SVE2");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuArithmeticKernel::get_implementation(
+                                    ElementwiseDataTypeISASelectorData{ data_type, cpu_isa, static_cast<int>(ArithmeticOperation::ADD) },
+                                    cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_arithmetic";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
+DATA_TEST_CASE(KernelSelection_elementwise_comparison, framework::DatasetMode::ALL, concat(concat(
+                                                                                               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                                                                                       framework::dataset::make("DataType", { DataType::F32,
+                                                                                                               DataType::F16,
+                                                                                                               DataType::S32,
+                                                                                                               DataType::S16,
+                                                                                                               DataType::U8,
+                                                                                                               DataType::QASYMM8,
+                                                                                                               DataType::QASYMM8_SIGNED
+                                                                                                                                            })),
+                                                                                               combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                                                                                                       framework::dataset::make("DataType", { DataType::F32,
+                                                                                                               DataType::F16,
+                                                                                                               DataType::S32,
+                                                                                                               DataType::S16,
+                                                                                                               DataType::U8
+                                                                                                                                            }))),
+                                                                                           combine(framework::dataset::make("CpuExt", std::string("SVE2")),
+                                                                                                   framework::dataset::make("DataType", { DataType::QASYMM8,
+                                                                                                           DataType::QASYMM8_SIGNED
+                                                                                                                                        }))),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.sve2 = (cpu_ext == "SVE2");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuComparisonKernel::get_implementation(
+                                    ElementwiseDataTypeISASelectorData{ data_type, cpu_isa, static_cast<int>(ComparisonOperation::Equal) },
+                                    cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_comparison";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
+TEST_SUITE_END()
+TEST_SUITE_END() // Neon
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseLog.cpp b/tests/validation/NEON/ElementwiseLog.cpp
index 81e6dc87e8..1175903dac 100644
--- a/tests/validation/NEON/ElementwiseLog.cpp
+++ b/tests/validation/NEON/ElementwiseLog.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,6 +46,15 @@ RelativeTolerance<float> tolerance_fp32(0.000001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<float> tolerance_fp16(0.01f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#if defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+#else  // #if !defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); // There is difference of 1, because quantizing in reference uses round policy "TO_NEAREST_UP", where the armv7a neon kernel uses "TO_ZERO"
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
+#endif // #if !defined(__aarch64__)
+
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(LogLayer)
@@ -53,6 +62,9 @@ TEST_SUITE(LogLayer)
 template <typename T>
 using NELogLayerFixture = LogValidationFixture<Tensor, Accessor, NELogLayer, T>;
 
+template <typename T>
+using NELogLayerQuantizedFixture = LogQuantizedValidationFixture<Tensor, Accessor, NELogLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -88,8 +100,35 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NELogLayerFixture<float>, framework::DatasetMod
 }
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NELogLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(10.5, 0), QuantizationInfo(0.5, -10)  })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(5, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NELogLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.75, -128) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(12.5, -2) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // LogLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseMax.cpp b/tests/validation/NEON/ElementwiseMax.cpp
index 449d5db3fc..61421ab3e5 100644
--- a/tests/validation/NEON/ElementwiseMax.cpp
+++ b/tests/validation/NEON/ElementwiseMax.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,6 +62,8 @@ const auto ElementwiseMaxFP16Dataset = combine(combine(framework::dataset::make(
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 const auto ElementwiseMaxFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                framework::dataset::make("DataType", DataType::F32));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -111,7 +113,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // *INDENT-ON*
 
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseMaxS32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ElementwiseMaxS32Dataset),
+                                                                                                              InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -119,7 +122,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<int32_t>, framework::Da
 TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMaxS16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMaxS16Dataset),
+                                                                                                        InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -131,11 +135,12 @@ using NEElementwiseMaxQuantizedFixture = ElementwiseMaxValidationQuantizedFixtur
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                        ElementwiseMaxQASYMM8Dataset),
                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                                                                                                                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -144,11 +149,13 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxQuantizedFixture<uint8_t>, fram
 template <typename T>
 using NEElementwiseMaxQuantizedBroadcastFixture = ElementwiseMaxQuantizedBroadcastValidationFixture<Tensor, Accessor, NEElementwiseMax, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMaxQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
-                       ElementwiseMaxQASYMM8Dataset),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMaxQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
+                                                               ElementwiseMaxQASYMM8Dataset),
+                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -156,16 +163,26 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMaxQuantizedBroadcastFixt
 TEST_SUITE_END()
 
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                       ElementwiseMaxQASYMM8SignedDataset),
                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(10.f, 20) })),
                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f, 0) })),
-                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f, -27) })))
+                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f, -27) })),
+                                                                                                                      OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallInPlace, NEElementwiseMaxQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                       ElementwiseMaxQASYMM8SignedDataset),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(10.f, -20) })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(10.f, -20) })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(10.f, -20) })),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-
 TEST_SUITE_END()
 
 TEST_SUITE_END()
@@ -173,7 +190,8 @@ TEST_SUITE_END()
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMaxFP16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMaxFP16Dataset),
+                                                                                                     InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -182,7 +200,8 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMaxFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMaxFP32Dataset),
+                                                                                                      InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -190,8 +209,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<float>, framework::Data
 template <typename T>
 using NEElementwiseMaxBroadcastFixture = ElementwiseMaxBroadcastValidationFixture<Tensor, Accessor, NEElementwiseMax, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMaxBroadcastFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapesBroadcast(),
-                                                                                                                        ElementwiseMaxFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMaxBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+                                                                                                                        ElementwiseMaxFP32Dataset),
+                                                                                                                        OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwiseMaxBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcastInplace(),
+                       ElementwiseMaxFP32Dataset),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -200,7 +227,7 @@ TEST_SUITE_END() // F32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // ElementwiseMax
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseMin.cpp b/tests/validation/NEON/ElementwiseMin.cpp
index 6678a1b52d..a134eb354d 100644
--- a/tests/validation/NEON/ElementwiseMin.cpp
+++ b/tests/validation/NEON/ElementwiseMin.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,6 +62,8 @@ const auto ElementwiseMinFP16Dataset = combine(combine(framework::dataset::make(
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 const auto ElementwiseMinFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                framework::dataset::make("DataType", DataType::F32));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -110,7 +112,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // *INDENT-ON*
 
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseMinS32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ElementwiseMinS32Dataset),
+                                                                                                              InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -118,7 +121,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<int32_t>, framework::Da
 TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMinS16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMinS16Dataset),
+                                                                                                        InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -133,23 +137,34 @@ TEST_SUITE(QASYMM8)
 template <typename T>
 using NEElementwiseMinQuantizedBroadcastFixture = ElementwiseMinQuantizedBroadcastValidationFixture<Tensor, Accessor, NEElementwiseMin, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMinQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
-                       ElementwiseMinQASYMM8Dataset),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMinQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
+                                                               ElementwiseMinQASYMM8Dataset),
+                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwiseMinQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(datasets::TinyShapesBroadcastInplace(),
+                                                               ElementwiseMinQASYMM8Dataset),
+                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 20) })),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 20) })),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 20) })),
+                               InPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                        ElementwiseMinQASYMM8Dataset),
                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) }))
-
-                      )
+                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                                                                                                                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -157,11 +172,12 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinQuantizedFixture<uint8_t>, fram
 TEST_SUITE_END()
 
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                       ElementwiseMaxQASYMM8SignedDataset),
                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(10.f, 20) })),
                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f, 0) })),
-                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f, -27) })))
+                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f, -27) })),
+                                                                                                                      OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
@@ -174,7 +190,8 @@ TEST_SUITE_END()
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMinFP16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMinFP16Dataset),
+                                                                                                     InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -183,7 +200,8 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMinFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMinFP32Dataset),
+                                                                                                      InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -192,8 +210,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<float>, framework::Data
 template <typename T>
 using NEElementwiseMinBroadcastFixture = ElementwiseMinBroadcastValidationFixture<Tensor, Accessor, NEElementwiseMin, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMinBroadcastFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapesBroadcast(),
-                                                                                                                        ElementwiseMinFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMinBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+                                                                                                                        ElementwiseMinFP32Dataset),
+                                                                                                                        OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwiseMinBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcastInplace(),
+                       ElementwiseMinFP32Dataset),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -202,7 +228,7 @@ TEST_SUITE_END() // F32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // ElementwiseMin
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseNegation.cpp b/tests/validation/NEON/ElementwiseNegation.cpp
index ae7dca1ef0..5b8ae8fc64 100644
--- a/tests/validation/NEON/ElementwiseNegation.cpp
+++ b/tests/validation/NEON/ElementwiseNegation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,6 +46,13 @@ RelativeTolerance<float> tolerance_fp32(0.000001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<float> tolerance_fp16(0.01f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+#else  // #if !defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); // There is difference of 1, because quantizing in reference uses round policy "TO_NEAREST_UP", where the armv7a neon kernel uses "TO_ZERO"
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
+#endif // #if !defined(__aarch64__)
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(NegLayer)
@@ -53,6 +60,9 @@ TEST_SUITE(NegLayer)
 template <typename T>
 using NENegLayerFixture = NegValidationInPlaceFixture<Tensor, Accessor, NENegLayer, T>;
 
+template <typename T>
+using NENegLayerQuantizedFixture = NegQuantizedValidationFixture<Tensor, Accessor, NENegLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -113,8 +123,34 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NENegLayerFixture<int32_t>, framework::DatasetM
 TEST_SUITE_END() // S32
 TEST_SUITE_END() // Integer
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.2, -3) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.5, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.075, 6) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.1, -7) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // NegLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwisePower.cpp b/tests/validation/NEON/ElementwisePower.cpp
index beef1c874b..9ac9eec280 100644
--- a/tests/validation/NEON/ElementwisePower.cpp
+++ b/tests/validation/NEON/ElementwisePower.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,6 +51,8 @@ const auto              ElementwisePowerFP16Dataset = combine(combine(framework:
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 const auto ElementwisePowerFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                  framework::dataset::make("DataType", DataType::F32));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -91,7 +93,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwisePowerFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwisePowerFP16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwisePowerFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwisePowerFP16Dataset),
+                                                                                                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp16, 0.01);
@@ -101,13 +104,15 @@ TEST_SUITE_END() // F16
 
 TEST_SUITE(F32)
 
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwisePowerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwisePowerFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwisePowerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwisePowerFP32Dataset),
+                                                                                                        InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEElementwisePowerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), ElementwisePowerFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEElementwisePowerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ElementwisePowerFP32Dataset),
+                                                                                                            InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -116,15 +121,23 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEElementwisePowerFixture<float>, framework::Da
 template <typename T>
 using NEElementwisePowerBroadcastFixture = ElementwisePowerBroadcastValidationFixture<Tensor, Accessor, NEElementwisePower, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwisePowerBroadcastFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapesBroadcast(),
-                       ElementwisePowerFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwisePowerBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+                       ElementwisePowerFP32Dataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
 }
-
-FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEElementwisePowerBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapesBroadcast(),
-                       ElementwisePowerFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwisePowerBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcastInplace(),
+                       ElementwisePowerFP32Dataset),
+                       InPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEElementwisePowerBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapesBroadcast(),
+                       ElementwisePowerFP32Dataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -133,7 +146,7 @@ TEST_SUITE_END() // F32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // ElementwisePower
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseRound.cpp b/tests/validation/NEON/ElementwiseRound.cpp
index e0f24128f6..620618cb0b 100644
--- a/tests/validation/NEON/ElementwiseRound.cpp
+++ b/tests/validation/NEON/ElementwiseRound.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -40,12 +40,20 @@ namespace test
 {
 namespace validation
 {
+namespace
+{
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+} // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(RoundLayer)
 
 template <typename T>
 using NERoundLayerFixture = RoundValidationFixture<Tensor, Accessor, NERoundLayer, T>;
 
+template <typename T>
+using NERoundLayerQuantizedFixture = RoundQuantizedValidationFixture<Tensor, Accessor, NERoundLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -81,8 +89,35 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NERoundLayerFixture<float>, framework::DatasetM
 }
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NERoundLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.2, -3) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.5, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NERoundLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.075, 6) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.1, -7) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // RoundLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseRsqrtLayer.cpp b/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
index 58efa28b31..80788c893f 100644
--- a/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
+++ b/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,13 +46,42 @@ RelativeTolerance<float> tolerance_fp32(0.000001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<float> tolerance_fp16(0.01f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+#else  // #if !defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); // There is difference of 1, because quantizing in reference uses round policy "TO_NEAREST_UP", where the armv7a neon kernel uses "TO_ZERO"
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
+#endif // #if !defined(__aarch64__)
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(RsqrtLayer)
 
+// Test test cases will execute the function with dynamic-stated shapes
+// Since other elementwise unary operations share the same kernel, this tests are added only here.
+// Also, only FP32 is tested since data type doesn't/shouldn't matter with dynamic shapes.
+TEST_SUITE(DynamicShape)
+TEST_SUITE(FP32)
+
+template <typename T>
+using CpuRsqrtDynamicShapeFixture = RsqrtDynamicShapeValidationFixture<Tensor, Accessor, NERsqrtLayer, T>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuRsqrtDynamicShapeFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                          DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // DynamicShape
+
 template <typename T>
 using NERsqrtLayerFixture = RsqrtValidationFixture<Tensor, Accessor, NERsqrtLayer, T>;
 
+template <typename T>
+using NERsqrtLayerQuantizedFixture = RsqrtQuantizedValidationFixture<Tensor, Accessor, NERsqrtLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -83,8 +112,34 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NERsqrtLayerFixture<float>, framework::DatasetM
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NERsqrtLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(20, 0) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.5, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NERsqrtLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(25, -128) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.1, -7) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // RsqrtLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseSin.cpp b/tests/validation/NEON/ElementwiseSin.cpp
index a2e6cb3760..9c2d7ae268 100644
--- a/tests/validation/NEON/ElementwiseSin.cpp
+++ b/tests/validation/NEON/ElementwiseSin.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,6 +46,8 @@ AbsoluteTolerance<float> tolerance_fp32(0.00001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 AbsoluteTolerance<float> tolerance_fp16(0.0005f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(SinLayer)
@@ -53,6 +55,9 @@ TEST_SUITE(SinLayer)
 template <typename T>
 using NESinLayerFixture = SinValidationFixture<Tensor, Accessor, NESinLayer, T>;
 
+template <typename T>
+using NESinLayerQuantizedFixture = SinQuantizedValidationFixture<Tensor, Accessor, NESinLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -89,8 +94,34 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESinLayerFixture<float>, framework::DatasetMod
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NESinLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.2, -3) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(200, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NESinLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.07, 6) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(123, -7) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // SinLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseSquareDiff.cpp b/tests/validation/NEON/ElementwiseSquareDiff.cpp
index b50db3d4a3..9a86b541de 100644
--- a/tests/validation/NEON/ElementwiseSquareDiff.cpp
+++ b/tests/validation/NEON/ElementwiseSquareDiff.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -68,6 +68,8 @@ const auto ElementwiseSquaredDiffFP16Dataset = combine(combine(framework::datase
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 const auto ElementwiseSquaredDiffFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                        framework::dataset::make("DataType", DataType::F32));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -109,7 +111,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // *INDENT-ON*
 
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseSquaredDiffS32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffS32Dataset),
+                                                                                                                      InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -117,7 +120,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<int32_t>, frame
 TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseSquaredDiffS16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffS16Dataset),
+                                                                                                                InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -129,13 +133,12 @@ using NEElementwiseSquaredDiffQuantizedFixture = ElementwiseSquaredDiffValidatio
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ElementwiseSquaredDiffQASYMM8Dataset),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) }))
-
-                      )
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -144,11 +147,23 @@ template <typename T>
 using NEElementwiseSquaredDiffQuantizedBroadcastFixture = ElementwiseSquaredDiffQuantizedBroadcastValidationFixture<Tensor, Accessor, NEElementwiseSquaredDiff, T>;
 
 FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseSquaredDiffQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
-                                                       ElementwiseSquaredDiffQASYMM8Dataset),
+                       combine(combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
+                                                               ElementwiseSquaredDiffQASYMM8Dataset),
+                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwiseSquaredDiffQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(datasets::TinyShapesBroadcastInplace(),
+                                                               ElementwiseSquaredDiffQASYMM8Dataset),
+                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                                                framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
-                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                               InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -156,11 +171,12 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseSquaredDiffQuantizedBroad
 TEST_SUITE_END()
 
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ElementwiseSquaredDiffQASYMM8SignedDataset),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f, 5) })),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(.5f, 5) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(.2f, 5) })))
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(.2f, 5) })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -171,7 +187,8 @@ TEST_SUITE_END()
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP16Dataset),
+                                                                                                             InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp16, 0.01);
@@ -180,7 +197,8 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP32Dataset),
+                                                                                                              InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -188,15 +206,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<float>, framewo
 template <typename T>
 using NEElementwiseSquaredDiffBroadcastFixture = ElementwiseSquaredDiffBroadcastValidationFixture<Tensor, Accessor, NEElementwiseSquaredDiff, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseSquaredDiffBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapesBroadcast(),
-                       ElementwiseSquaredDiffFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseSquaredDiffBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapesBroadcast(),
+                       ElementwiseSquaredDiffFP32Dataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEElementwiseSquaredDiffBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapesBroadcast(),
-                       ElementwiseSquaredDiffFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEElementwiseSquaredDiffBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapesBroadcast(),
+                       ElementwiseSquaredDiffFP32Dataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -205,7 +225,7 @@ TEST_SUITE_END() // F32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // ElementwiseSquaredDiff
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/EqualizeHistogram.cpp b/tests/validation/NEON/EqualizeHistogram.cpp
deleted file mode 100644
index b781b01714..0000000000
--- a/tests/validation/NEON/EqualizeHistogram.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/EqualizeHistogramFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(EqualizeHistogram)
-template <typename T>
-using NEEqualizeHistogramFixture = EqualizeHistogramValidationFixture<Tensor, Accessor, NEEqualizeHistogram, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEEqualizeHistogramFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                               DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEEqualizeHistogramFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                               DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Erode.cpp b/tests/validation/NEON/Erode.cpp
deleted file mode 100644
index aca579b0e3..0000000000
--- a/tests/validation/NEON/Erode.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEErode.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ErodeFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /* Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /* Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Erode)
-
-template <typename T>
-using NEErodeFixture = ErodeValidationFixture<Tensor, Accessor, NEErode, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEErodeFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                           DataType::U8)),
-                                                                                                   datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEErodeFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                           DataType::U8)),
-                                                                                                   datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/FFT.cpp b/tests/validation/NEON/FFT.cpp
index bc528dd9a6..f7ef0a314e 100644
--- a/tests/validation/NEON/FFT.cpp
+++ b/tests/validation/NEON/FFT.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -158,6 +158,8 @@ TEST_SUITE(FFTConvolutionLayer)
 
 template <typename T>
 using NEFFTConvolutionLayerFixture = FFTConvolutionValidationFixture<Tensor, Accessor, NEFFTConvolutionLayer, T>;
+template <typename T>
+using NEFFTConvolutionLayerMixedDataLayoutFixture = FFTConvolutionValidationFixture<Tensor, Accessor, NEFFTConvolutionLayer, T, true>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
@@ -169,11 +171,18 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEFFTConvolutionLayerFixture<float>, framework:
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32, tolerance_num);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFFTConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFFTConvolutionLayerDataset(),
+                                                                                                                 framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                 ActivationFunctionsSmallDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32, tolerance_num);
+}
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // FFTConvolutionLayer
-
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/FastCorners.cpp b/tests/validation/NEON/FastCorners.cpp
deleted file mode 100644
index 107702c6e7..0000000000
--- a/tests/validation/NEON/FastCorners.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEFastCorners.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/ArrayAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ImageFileDatasets.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/FastCornersFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/* Tolerance used to compare corner strengths */
-const AbsoluteTolerance<float> tolerance(0.5f);
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(FastCorners)
-
-template <typename T>
-using NEFastCornersFixture = FastCornersValidationFixture<Tensor, Accessor, KeyPointArray, NEFastCorners, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEFastCornersFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallImageFiles(), framework::dataset::make("Format", Format::U8)),
-                                                                                                                 framework::dataset::make("SuppressNonMax", { false, true })),
-                                                                                                         framework::dataset::make("BorderMode", BorderMode::UNDEFINED)))
-{
-    // Validate output
-    ArrayAccessor<KeyPoint> array(_target);
-    validate_keypoints(array.buffer(), array.buffer() + array.num_values(), _reference.begin(), _reference.end(), tolerance);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEFastCornersFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeImageFiles(), framework::dataset::make("Format", Format::U8)),
-                                                                                                                 framework::dataset::make("SuppressNonMax", { false, true })),
-                                                                                                         framework::dataset::make("BorderMode", BorderMode::UNDEFINED)))
-{
-    // Validate output
-    ArrayAccessor<KeyPoint> array(_target);
-    validate_keypoints(array.buffer(), array.buffer() + array.num_values(), _reference.begin(), _reference.end(), tolerance);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Fill.cpp b/tests/validation/NEON/Fill.cpp
index 5fe92ca35e..0073f22677 100644
--- a/tests/validation/NEON/Fill.cpp
+++ b/tests/validation/NEON/Fill.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -100,7 +100,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEFillFixture<uint32_t>, framework::DatasetMode
 }
 TEST_SUITE_END() // U32
 
-
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEFillFixture<int32_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::S32)))
 {
@@ -118,7 +117,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEFillFixture<float>, framework::DatasetMode::A
 TEST_SUITE_END() // F32
 
 TEST_SUITE_END() // Fill
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/FillBorder.cpp b/tests/validation/NEON/FillBorder.cpp
index 343ad831e4..928990b2b4 100644
--- a/tests/validation/NEON/FillBorder.cpp
+++ b/tests/validation/NEON/FillBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,10 +60,10 @@ DATA_TEST_CASE(FillBorder, framework::DatasetMode::ALL, combine(combine(combine(
 {
     BorderSize border_size{ static_cast<unsigned int>(size) };
 
-    std::mt19937                           generator(library->seed());
-    std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
-    const uint8_t                          border_value = distribution_u8(generator);
-    const uint8_t                          tensor_value = distribution_u8(generator);
+    std::mt19937                            generator(library->seed());
+    std::uniform_int_distribution<uint32_t> distribution_u8(0, 255);
+    const uint8_t                           border_value = distribution_u8(generator);
+    const uint8_t                           tensor_value = distribution_u8(generator);
 
     // Create tensors
     Tensor src = create_tensor<Tensor>(shape, data_type);
@@ -77,7 +77,7 @@ DATA_TEST_CASE(FillBorder, framework::DatasetMode::ALL, combine(combine(combine(
     validate(src.info()->padding(), padding);
 
     // Fill tensor with constant value
-    std::uniform_int_distribution<uint8_t> distribution{ tensor_value, tensor_value };
+    std::uniform_int_distribution<uint32_t> distribution{ tensor_value, tensor_value };
     library->fill(Accessor(src), distribution, 0);
 
     // Create and configure kernel
diff --git a/tests/validation/NEON/Flatten.cpp b/tests/validation/NEON/Flatten.cpp
index daadd56a6c..4ebd897c7a 100644
--- a/tests/validation/NEON/Flatten.cpp
+++ b/tests/validation/NEON/Flatten.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -100,7 +100,7 @@ TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // FlattenLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Floor.cpp b/tests/validation/NEON/Floor.cpp
index 419ce56e44..3cd1033ef9 100644
--- a/tests/validation/NEON/Floor.cpp
+++ b/tests/validation/NEON/Floor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,9 +22,12 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEFloor.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuFloorKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ShapeDatasets.h"
@@ -62,6 +65,30 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
     const Status status = NEFloor::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false));
     ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
 }
+
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
+               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType", { DataType::F32,
+                                                              DataType::F16,
+                                                            })),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuFloorKernel::get_implementation(DataTypeISASelectorData{data_type, cpu_isa}, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_floor";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
 // clang-format on
 // *INDENT-ON*
 
diff --git a/tests/validation/NEON/FullyConnectedLayer.cpp b/tests/validation/NEON/FullyConnectedLayer.cpp
index d8c2203802..ee7e56227d 100644
--- a/tests/validation/NEON/FullyConnectedLayer.cpp
+++ b/tests/validation/NEON/FullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,8 @@
 #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuFullyConnected.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/FullyConnectedLayerDataset.h"
@@ -40,6 +42,7 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
 namespace
 {
 /** Tolerance for float operations */
@@ -56,7 +59,7 @@ constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
 constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
 
 /** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
+const auto CNNDataTypes = make("DataType",
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     DataType::F16,
@@ -64,18 +67,25 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
     DataType::F32,
 });
 
-const auto FullyConnectedParameters = combine(framework::dataset::make("TransposeWeights", { false, true }), framework::dataset::make("ReshapeWeights", { false, true }));
+const auto FullyConnectedParameters = combine(make("TransposeWeights", { false, true }), make("ReshapeWeights", { false, true }));
 
-const auto QuantizationData = framework::dataset::make("QuantizationInfo",
+const auto QuantizationData = make("QuantizationInfo",
 {
     QuantizationInfo(1.f / 256.f, 10),
     QuantizationInfo(1.1f, 10),
 });
-const auto EmptyActivationFunctionDataset = framework::dataset::make("ActivationInfo",
+
+const auto IgnoredQuantizationData = make("IgnoredQuantizationInfo",
+{
+    QuantizationInfo(),
+});
+
+const auto NoActivationFunctionDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(),
 });
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+
+const auto ActivationFunctionsDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
@@ -83,7 +93,7 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH),
 });
 
-const auto ActivationFunctionsQuantizedDataset = framework::dataset::make("ActivationInfo",
+const auto ActivationFunctionsQuantizedDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
@@ -94,40 +104,183 @@ const auto ActivationFunctionsQuantizedDataset = framework::dataset::make("Activ
 TEST_SUITE(NEON)
 TEST_SUITE(FullyConnectedLayer)
 
+/** Test case for memory injection in @ref cpu::CpuFullyConnected.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto       fc          = std::make_unique<cpu::CpuFullyConnected>();
+    const auto src_info    = TensorInfo(TensorShape(8U), 1, DataType::F32, DataLayout::NHWC);
+    const auto weight_info = TensorInfo(TensorShape(8U, 4U), 1, DataType::F32, DataLayout::NHWC);
+    const auto bias_info   = TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC);
+    auto       dst_info    = TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC);
+    const auto fc_info     = FullyConnectedLayerInfo{};
+    fc->configure(&src_info, &weight_info, &bias_info, &dst_info, fc_info);
+
+    // telhs are newly created every call of this lambda function
+    auto src    = create_tensor<Tensor>(src_info);
+    auto weight = create_tensor<Tensor>(weight_info);
+    auto bias   = create_tensor<Tensor>(bias_info);
+    src.allocator()->allocate();
+    weight.allocator()->allocate();
+    bias.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(fc->workspace(), mg, run_pack, prep_pack);
+
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        // This operator is configured once and captured by this lambda.
+        fc->prepare(prep_pack);
+        fc->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Test case for memory injection in @ref NEFullyConnectedLayer.
+ *
+ * Make sure @ref NEFullyConnectedLayer still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto       fc          = std::make_unique<NEFullyConnectedLayer>();
+    const auto src_info    = TensorInfo(TensorShape(8U), 1, DataType::F32, DataLayout::NHWC);
+    const auto weight_info = TensorInfo(TensorShape(8U, 4U), 1, DataType::F32, DataLayout::NHWC);
+    const auto bias_info   = TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC);
+    auto       dst_info    = TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC);
+    const auto fc_info     = FullyConnectedLayerInfo{};
+    auto       run_conv    = [&]()
+    {
+        auto src    = create_tensor<Tensor>(src_info);
+        auto weight = create_tensor<Tensor>(weight_info);
+        auto bias   = create_tensor<Tensor>(bias_info);
+        auto dst    = create_tensor<Tensor>(dst_info);
+        fc->configure(&src, &weight, &bias, &dst, fc_info);
+        src.allocator()->allocate();
+        weight.allocator()->allocate();
+        bias.allocator()->allocate();
+        dst.allocator()->allocate();
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        fc->run();
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Unit test for @ref cpu::CpuFullyConnected with quantized multipler > 1
+ *
+ * Tests output correctness.
+ */
+TEST_CASE(Quant8_Signed_Mult_gt_1, framework::DatasetMode::ALL)
+{
+    auto       fc          = std::make_unique<cpu::CpuFullyConnected>();
+    const auto src_info    = TensorInfo(TensorShape(1U, 3U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.5f, -1));
+    const auto weight_info = TensorInfo(TensorShape(1U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.5, -8));
+    const auto bias_info   = TensorInfo(TensorShape(1U), 1, DataType::S32);
+    auto       dst_info    = TensorInfo(TensorShape(1U, 3U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.1f, 0));
+    const auto fc_info     = FullyConnectedLayerInfo{};
+    fc->configure(&src_info, &weight_info, &bias_info, &dst_info, fc_info);
+
+    // telhs are newly created every call of this lambda function
+    auto src    = create_tensor<Tensor>(src_info);
+    auto weight = create_tensor<Tensor>(weight_info);
+    auto bias   = create_tensor<Tensor>(bias_info);
+    auto dst    = create_tensor<Tensor>(dst_info);
+    src.allocator()->allocate();
+    weight.allocator()->allocate();
+    bias.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias }, { TensorType::ACL_DST, &dst } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(fc->workspace(), mg, run_pack, prep_pack);
+
+    // Initialize input values
+    const std::vector<int8_t>  src_values    = { 3, 63, 31 };
+    const std::vector<int8_t>  weight_values = { -4 };
+    const std::vector<int32_t> bias_values   = { 16 };
+    const std::vector<int32_t> expected      = { 80, 127, 127 };
+    library->fill_static_values(Accessor(src), src_values);
+    library->fill_static_values(Accessor(weight), weight_values);
+    library->fill_static_values(Accessor(bias), bias_values);
+
+    // Run FC layer
+    fc->prepare(prep_pack);
+    fc->run(run_pack);
+
+    auto dst_ptr = reinterpret_cast<int8_t *>(dst.buffer());
+    for(size_t i = 0; i < dst.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(dst_ptr[i] == expected[i], framework::LogLevel::ERRORS);
+    }
+}
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
-    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Mismatching data types
+    make("InputInfo", { TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Mismatching data types
                                             TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Invalid weights dimensions
                                             TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Wrongly reshaped weights
                                             TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
                                           }),
-    framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(315U, 271U), 1, DataType::F16),
+    make("WeightsInfo",{ TensorInfo(TensorShape(315U, 271U), 1, DataType::F16),
                                              TensorInfo(TensorShape(192U, 192U), 1, DataType::F32),
                                              TensorInfo(TensorShape(192U, 192U), 1, DataType::F32),
                                              TensorInfo(TensorShape(217U, 315U), 1, DataType::F32),
                                              TensorInfo(TensorShape(217U, 315U), 1, DataType::F32),
                                              TensorInfo(TensorShape(192U, 192U), 1, DataType::F32),
                                           })),
-    framework::dataset::make("BiasInfo",{ TensorInfo(TensorShape(271U), 1, DataType::F32),
+    make("BiasInfo",{ TensorInfo(TensorShape(271U), 1, DataType::F32),
                                           TensorInfo(TensorShape(192U), 1, DataType::F32),
                                           TensorInfo(TensorShape(192U), 1, DataType::F32),
                                           TensorInfo(TensorShape(271U), 1, DataType::F32),
                                           TensorInfo(TensorShape(271U), 1, DataType::F32),
                                           TensorInfo(TensorShape(192U), 1, DataType::F32),
                                           })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
+    make("OutputInfo",{ TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
                                             TensorInfo(TensorShape(192U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(192U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
                                             TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
                                             TensorInfo(TensorShape(192U, 4U), 1, DataType::F32),
                                            })),
-    framework::dataset::make("TransposeWeights",{ true, true, false, true, true, true })),
-    framework::dataset::make("ReshapedWeights",{ false, false, false, false, false , false})),
-    framework::dataset::make("Expected", { false, true, true, false, false, true })),
+    make("TransposeWeights",{ true, true, false, true, true, true })),
+    make("ReshapedWeights",{ false, false, false, false, false , false})),
+    make("Expected", { false, true, true, false, false, true })),
     input_info, weights_info, bias_info, output_info, transpose_weights, reshaped_weights, expected)
 {
     // Create Fully Connected layer info
@@ -143,130 +296,251 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
 
 template <typename T>
 using NEFullyConnectedLayerFixture = FullyConnectedLayerValidationFixture<Tensor, Accessor, NEFullyConnectedLayer, T>;
+template <typename T>
+using NEFullyConnectedLayerMixedDataLayoutFixture = FullyConnectedLayerValidationFixture<Tensor, Accessor, NEFullyConnectedLayer, T, true>;
+template <typename T>
+using NEFullyConnectedLayerDynamicWeightsFixture = FullyConnectedWithDynamicWeightsFixture<Tensor, Accessor, NEFullyConnectedLayer, T>;
+template <typename T>
+using NEFullyConnectedLayerDynamicBiasFixture = FullyConnectedWithDynamicBiasFixture<Tensor, Accessor, NEFullyConnectedLayer, T>;
 
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(),
-                                                                                                                        FullyConnectedParameters),
-                                                                                                                        framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                                                                                                                        FullyConnectedParameters,
+                                                                                                                        make("DataType", DataType::F16),
+                                                                                                                NoActivationFunctionDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16, abs_tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(
+FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
                            combine(datasets::FullyConnectedLayerWithActivationDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::F16)),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::F16),
                        ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16, abs_tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(),
-                                                                                                                      FullyConnectedParameters),
-                                                                                                                      framework::dataset::make("DataType", DataType::F16)),
-                                                                                                              EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeFullyConnectedLayerDataset(),
+                                                                                                                      FullyConnectedParameters,
+                                                                                                                      make("DataType", DataType::F16),
+                                                                                                              NoActivationFunctionDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16, abs_tolerance_f16);
 }
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::F16),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)),
+                       make("WeightsReshaped", { false, true })))
+{
+}
 TEST_SUITE_END()
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters),
-                                                                                                                 framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                 EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters,
+                                                                                                                 make("DataType", DataType::F32),
+                                                                                                                 NoActivationFunctionDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(
-                           combine(datasets::FullyConnectedLayerWithActivationDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::F32)),
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFullyConnectedLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(
+                           make("Input", TensorShape(9U, 5U, 7U)),
+                           make("Weights", TensorShape(315U, 271U)),
+                       make("Biases", TensorShape(271U)),
+                       make("Output", TensorShape(271U)),
+                       FullyConnectedParameters,
+                       make("DataType", DataType::F32),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::FullyConnectedLayerWithActivationDataset(),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::F32),
                        ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters),
-                                                                                                                       framework::dataset::make("DataType", DataType::F32)),
-                                                                                                               EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters,
+                                                                                                                       make("DataType", DataType::F32),
+                                                                                                               NoActivationFunctionDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
 }
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::F32),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)),
+                       make("WeightsReshaped", { false, true })))
+{
+}
 TEST_SUITE_END()
 TEST_SUITE_END()
 
 template <typename T>
 using NEFullyConnectedLayerQuantizedFixture = FullyConnectedLayerValidationQuantizedFixture<Tensor, Accessor, NEFullyConnectedLayer, T>;
+template <typename T>
+using NEFullyConnectedLayerQuantizedMixedDataLayoutFixture = FullyConnectedLayerValidationQuantizedFixture<Tensor, Accessor, NEFullyConnectedLayer, T, true>;
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(
-                           combine(datasets::SmallFullyConnectedLayerDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8)),
-                       QuantizationData),
-                       EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                        combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8),
+                                       QuantizationData,
+                               make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-
-FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(
+FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                            combine(datasets::FullyConnectedLayerWithActivationDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8)),
-                       QuantizationData),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::QASYMM8),
+                       QuantizationData,
                        ActivationFunctionsQuantizedDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+FIXTURE_DATA_TEST_CASE(RunDynamicWeightsWithActivation, NEFullyConnectedLayerDynamicWeightsFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)),
+                       make("WeightsReshaped", { false })))
+{
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicBiasWithActivation, NEFullyConnectedLayerDynamicBiasFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))))
+{
+}
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(
-                           combine(datasets::LargeFullyConnectedLayerDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8)),
-                       QuantizationData),
-                       EmptyActivationFunctionDataset))
+// Dynamic Quantization Tests here
+FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                           combine(datasets::SmallFullyConnectedLayerDataset(),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::QASYMM8),
+                       IgnoredQuantizationData,
+                       NoActivationFunctionDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-TEST_SUITE_END()
+FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(
+                           datasets::LargeFullyConnectedLayerDataset(),
+                            FullyConnectedParameters,
+                           framework::dataset::make("DataType", DataType::QASYMM8),
+                       QuantizationData,
+                       NoActivationFunctionDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicBias, NEFullyConnectedLayerDynamicBiasFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8),
+                       NoActivationFunctionDataset))
+{
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                        combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8),
+                                       IgnoredQuantizationData,
+                               NoActivationFunctionDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8),
+                       NoActivationFunctionDataset,
+                       make("WeightsReshaped", { false })))
+{
+}
+TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(
-                           combine(datasets::SmallFullyConnectedLayerDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                       QuantizationData),
-                       EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                        combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8_SIGNED),
+                                       QuantizationData,
+                               make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-
-FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(
+FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
                            combine(datasets::FullyConnectedLayerWithActivationDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                       QuantizationData),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::QASYMM8_SIGNED),
+                       QuantizationData,
                        ActivationFunctionsQuantizedDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+FIXTURE_DATA_TEST_CASE(RunDynamicWeightsWithActivation, NEFullyConnectedLayerDynamicWeightsFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8_SIGNED),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)),
+                       make("WeightsReshaped", { false })))
+{
+}
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+// Dynamic Quantization tests
+FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(
+                           datasets::SmallFullyConnectedLayerDataset(),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::QASYMM8_SIGNED),
+                       IgnoredQuantizationData,
+                       NoActivationFunctionDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                        combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8_SIGNED),
+                                       QuantizationData,
+                               NoActivationFunctionDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8_SIGNED),
+                       NoActivationFunctionDataset,
+                       make("WeightsReshaped", { false })))
+{
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+TEST_SUITE_END() // FullyConnectedLayer
+TEST_SUITE_END() // NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/FuseBatchNormalization.cpp b/tests/validation/NEON/FuseBatchNormalization.cpp
index 9fc3353aaf..62265c6ac9 100644
--- a/tests/validation/NEON/FuseBatchNormalization.cpp
+++ b/tests/validation/NEON/FuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -214,7 +214,7 @@ TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // DepthwiseConvolution
 TEST_SUITE_END() // FuseBatchNormalization
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
-\ No newline at end of file
+} // namespace arm_compute
diff --git a/tests/validation/NEON/GEMM.cpp b/tests/validation/NEON/GEMM.cpp
index 2d8c61164b..5f6a402204 100644
--- a/tests/validation/NEON/GEMM.cpp
+++ b/tests/validation/NEON/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,12 +22,15 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
+#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"
+#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
+#include "src/cpu/operators/CpuGemm.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/NEON/Helper.h"
 #include "tests/PaddingCalculator.h"
@@ -48,6 +51,8 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
+
 namespace
 {
 constexpr AbsoluteTolerance<float> tolerance_f(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for FP32 data types */
@@ -57,7 +62,7 @@ const AbsoluteTolerance<float>      abs_tolerance_f16(0.2f);      /**< Absolute
 constexpr float                     tolerance_num = 0.07f;        /**< Tolerance number for FP16 data types */
 #endif                                                            /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 /** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
+const auto CNNDataTypes = make("DataType",
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     DataType::F16,
@@ -65,62 +70,210 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
     DataType::F32,
 });
 
-const auto data_interleave = framework::dataset::make("M", 8, 12) * framework::dataset::make("N", 8, 12);
-const auto data_transpose  = framework::dataset::make("M", 8, 14) * framework::dataset::make("N", 7, 14);
+const auto data_interleave = make("M", 8, 12) * make("N", 8, 12);
+const auto data_transpose  = make("M", 8, 14) * make("N", 7, 14);
 
 /** Zero padding test */
 template <typename FunctionType>
 bool validate_zero_padding(unsigned int dim0_value, unsigned int dim1_value)
 {
     const TensorShape in_shape(dim0_value, dim1_value);
+    TensorInfo        in(in_shape, 1, DataType::U32);
+    TensorInfo        dst;
 
-    // Create tensors
-    Tensor in = create_tensor<Tensor>(in_shape, DataType::U32);
-    Tensor dst;
-
-    ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(in.is_resizable(), framework::LogLevel::ERRORS);
 
     // Validate zero-padding
     FunctionType func;
 
     func.configure(&in, &dst);
 
-    return in.info()->padding().empty();
+    return in.padding().empty();
 }
 
 /* Zero padding test for GEMM kernels */
 bool validate_gemm_zero_padding(const TensorShape shape0, const TensorShape shape1)
 {
     // Create tensors
-    Tensor in0 = create_tensor<Tensor>(shape0, DataType::F32);
-    Tensor in1 = create_tensor<Tensor>(shape1, DataType::F32);
-    Tensor dst;
+    TensorInfo in0(shape0, 1, DataType::F32);
+    TensorInfo in1(shape1, 1, DataType::F32);
+    TensorInfo dst;
 
     // Validate zero-padding
-    NEGEMMMatrixMultiplyKernel gemm;
+    cpu::kernels::CpuGemmMatrixMultiplyKernel gemm;
     gemm.configure(&in0, &in1, &dst, 1.0, false);
 
-    return in0.info()->padding().empty() && in1.info()->padding().empty() && dst.info()->padding().empty();
+    return in0.padding().empty() && in1.padding().empty() && dst.padding().empty();
 }
 } // namespace
 
 TEST_SUITE(NEON)
 TEST_SUITE(GEMM)
 
+/** Test case for memory injection in @ref cpu::CpuGemm.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto       gemm      = std::make_unique<cpu::CpuGemm>();
+    const auto lhs_info  = TensorInfo(TensorShape(3U, 3U), 1, DataType::F32);
+    const auto rhs_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    const auto c_info    = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    auto       dst_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    const auto gemm_info = GEMMInfo{};
+    gemm->configure(&lhs_info, &rhs_info, &c_info, &dst_info, 1.f, 1.f, gemm_info);
+
+    // telhs are newly created every call of this lambda function
+    auto lhs = create_tensor<Tensor>(lhs_info);
+    auto rhs = create_tensor<Tensor>(rhs_info);
+    auto c   = create_tensor<Tensor>(c_info);
+    lhs.allocator()->allocate();
+    rhs.allocator()->allocate();
+    c.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &lhs }, { TensorType::ACL_SRC_1, &rhs }, { TensorType::ACL_SRC_2, &c } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &rhs }, { TensorType::ACL_SRC_2, &c } };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(gemm->workspace(), mg, run_pack, prep_pack);
+
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+        library->fill_tensor_value(Accessor(lhs), 1.f);
+        library->fill_tensor_value(Accessor(rhs), 2.f);
+        library->fill_tensor_value(Accessor(c), 3.f);
+        // This operator is configured once and captured by this lambda.
+        gemm->prepare(prep_pack);
+        gemm->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Test case for memory injection in @ref NEGEMM.
+ *
+ * Make sure @ref NEGEMM still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto       gemm      = std::make_unique<NEGEMM>();
+    const auto lhs_info  = TensorInfo(TensorShape(3U, 3U), 1, DataType::F32);
+    const auto rhs_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    const auto c_info    = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    auto       dst_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    const auto gemm_info = GEMMInfo{};
+    auto       run_conv  = [&]()
+    {
+        auto lhs = create_tensor<Tensor>(lhs_info);
+        auto rhs = create_tensor<Tensor>(rhs_info);
+        auto c   = create_tensor<Tensor>(c_info);
+        auto dst = create_tensor<Tensor>(dst_info);
+        gemm->configure(&lhs, &rhs, &c, &dst, 1.f, 1.f, gemm_info);
+        lhs.allocator()->allocate();
+        rhs.allocator()->allocate();
+        c.allocator()->allocate();
+        dst.allocator()->allocate();
+        library->fill_tensor_value(Accessor(lhs), 1.f);
+        library->fill_tensor_value(Accessor(rhs), 2.f);
+        library->fill_tensor_value(Accessor(c), 3.f);
+        gemm->run();
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+               make("LhsInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::S32), // Unsupported data type
+                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),
+                                                     }),
+               make("RhsInfo",{ TensorInfo(TensorShape(8U, 27U), 1, DataType::S32),
+                                                        TensorInfo(TensorShape(8U, 27U), 1, DataType::F32),
+                                                     })),
+               make("OutputInfo",{ TensorInfo(TensorShape(8U, 13U), 1, DataType::S32),
+                                                        TensorInfo(TensorShape(8U, 13U), 1, DataType::F32),
+                                                     })),
+               make("Expected", { false, true })),
+               lhs_info, rhs_info, output_info, expected)
+{
+    constexpr float alpha = 1.0;
+    constexpr float beta = 0.0;
+    const auto gemm_info = GEMMInfo();
+    bool is_valid = bool(NEGEMM::validate(&lhs_info.clone()->set_is_resizable(true), &rhs_info.clone()->set_is_resizable(true), nullptr, &output_info.clone()->set_is_resizable(true), alpha, beta, gemm_info));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE(KERNEL_SELECTION)
+DATA_TEST_CASE(KernelSelection_mul_and_add, framework::DatasetMode::ALL,
+               combine(make("CpuExt", std::string("NEON")),
+                       make("DataType", { DataType::F32,
+                                                              DataType::F16
+                                                            })),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl_mul = CpuGemmMatrixMultiplyKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl_mul);
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_gemm_matrix_mul";
+    std::string actual   = selected_impl_mul->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+
+    const auto *selected_impl_add = CpuGemmMatrixAdditionKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl_add);
+
+    expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_gemm_matrix_add";
+    actual   = selected_impl_add->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // KERNEL_SELECTION
+
 TEST_SUITE(TRANSPOSE_1XW)
-using NEGEMMTranspose1xW = NESynthetizeFunctionWithZeroConstantBorder<NEGEMMTranspose1xWKernel, 4>;
+using CpuGemmTranspose1xW = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuGemmTranspose1xWKernel>;
 DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
-                   framework::dataset::make("N", { 1, 23, 63, 101 }),
-                   framework::dataset::make("K", { 1, 47, 29, 27 })),
+                   make("N", { 1, 23, 63, 101 }),
+                   make("K", { 1, 47, 29, 27 })),
                n_value, k_value)
 {
-    bool status = validate_zero_padding<NEGEMMTranspose1xWKernel>(n_value, k_value);
+    bool status = validate_zero_padding<CpuGemmTranspose1xW>(n_value, k_value);
     ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
 }
 
 TEST_SUITE(U32)
-using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, NEGEMMTranspose1xW, uint32_t>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U32))
+using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint32_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * make("DataType", DataType::U32))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -128,8 +281,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMo
 TEST_SUITE_END() // U32
 
 TEST_SUITE(U16)
-using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, NEGEMMTranspose1xW, uint16_t>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U16))
+using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint16_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * make("DataType", DataType::U16))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -137,8 +290,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMo
 TEST_SUITE_END() // U16
 
 TEST_SUITE(U8)
-using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, NEGEMMTranspose1xW, uint8_t>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U8))
+using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint8_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * make("DataType", DataType::U8))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -148,20 +301,20 @@ TEST_SUITE_END() // U8
 TEST_SUITE_END() // TRANSPOSE_1XW
 
 TEST_SUITE(INTERLEAVE_4X4)
-using NEGEMMInterleave4x4 = NESynthetizeFunctionWithZeroConstantBorder<NEGEMMInterleave4x4Kernel, 4>;
+using CpuGemmInterleave4x4 = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuGemmInterleave4x4Kernel>;
 
 DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
-                   framework::dataset::make("M", { 1, 23, 63, 101 }),
-                   framework::dataset::make("K", { 1, 47, 29, 27 })),
+                   make("M", { 1, 23, 63, 101 }),
+                   make("K", { 1, 47, 29, 27 })),
                m_value, k_value)
 {
-    bool status = validate_zero_padding<NEGEMMInterleave4x4Kernel>(m_value, k_value);
+    bool status = validate_zero_padding<cpu::kernels::CpuGemmInterleave4x4Kernel>(m_value, k_value);
     ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
 }
 
 TEST_SUITE(U32)
-using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, uint32_t>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U32))
+using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint32_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * make("DataType", DataType::U32))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -169,8 +322,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetM
 TEST_SUITE_END() // U32
 
 TEST_SUITE(U16)
-using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, uint16_t>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U16))
+using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint16_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * make("DataType", DataType::U16))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -178,8 +331,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetM
 TEST_SUITE_END() // U16
 
 TEST_SUITE(U8)
-using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, uint8_t>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::QASYMM8))
+using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint8_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * make("DataType", DataType::QASYMM8))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -188,23 +341,24 @@ TEST_SUITE_END() // U8
 
 TEST_SUITE_END() // INTERLEAVE_4X4
 
-//TODO(COMPMID-415): Validate valid region
-
 template <typename T>
 using NEGEMMFixture = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T>;
 
 template <typename T>
-using NEGEMMFixtureDisabledC = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T, true>;
+using NEBatchedMatMulFixture = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T, true, false, false, false, false, true>;
+
+template <typename T>
+using NEGEMMAccumulateFixture = GEMMAccumulateValidationFixture<Tensor, Accessor, NEGEMM, T>;
 
 TEST_SUITE(Float)
-DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(framework::dataset::make("In0", { TensorShape(21U, 13U),
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(make("In0", { TensorShape(21U, 13U),
                                                                                                        TensorShape(31U, 1U),
                                                                                                        TensorShape(31U, 1U),
                                                                                                        TensorShape(8U, 2U),
                                                                                                        TensorShape(38U, 12U),
                                                                                                        TensorShape(32U, 1U)
                                                                                                      }),
-                                                                     framework::dataset::make("In1", { TensorShape(33U, 21U),
+                                                                     make("In1", { TensorShape(33U, 21U),
                                                                                                        TensorShape(23U, 31U),
                                                                                                        TensorShape(23U, 31U),
                                                                                                        TensorShape(16U, 8U),
@@ -217,59 +371,111 @@ DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(framework::
     ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
 }
 
+DATA_TEST_CASE(ValidateAccumulate, framework::DatasetMode::ALL, combine(
+                                                                     zip(make("In0",{ TensorShape(21U, 13U) }),
+                                                                     make("In1", { TensorShape(33U, 21U) }),
+                                                                     make("Dst", { TensorShape(33U, 13U) })),
+                                                                     zip(
+                                                                     make("alpha", { 1.0, 100.0, 1.0, 1.0 }),
+                                                                     make("beta", { 0.0, 0.0, 1.0, 1.0 }),
+                                                                     make("is_c_null", { false, false, false, true }),
+                                                                     make("Expected", { true, false, false, true }))),
+               shape_a, shape_b, shape_dst, alpha, beta, is_c_null, expected)
+{
+    /* Accumulation test for GEMM kernels */
+    // Create tensors
+    TensorInfo in_a(shape_a, 1, DataType::F32);
+    TensorInfo in_b(shape_b, 1, DataType::F32);
+    TensorInfo in_c(shape_dst, 1, DataType::F32);
+    TensorInfo dst(shape_dst, 1, DataType::F32);
+
+    GEMMInfo gemm_info = GEMMInfo();
+    gemm_info.set_accumulate(true);
+
+    // Validate accumulation
+    cpu::CpuGemm gemm;
+    Status status = gemm.validate(&in_a, &in_b, (is_c_null ? nullptr : &in_c), &dst, alpha, beta, gemm_info);
+    ARM_COMPUTE_EXPECT((expected ==  bool(status)), framework::LogLevel::ERRORS);
+}
+
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(),
-                                                                                                         framework::dataset::make("ReshapeWeights", { true, false })),
-                                                                                                 framework::dataset::make("DataType", DataType::F16)))
+                                                                                                         make("ReshapeWeights", { true, false })),
+                                                                                                 make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeGEMMDataset(),
-                                                                                                       framework::dataset::make("ReshapeWeights", { true, false })),
+                                                                                                       make("ReshapeWeights", { true, false })),
+                                                                                               make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
+}
 
-                                                                                               framework::dataset::make("DataType", DataType::F16)))
+TEST_SUITE(BATCHED_MATMUL)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchedMatMulFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallBatchedMatMulDataset(),
+                                                                                                                  make("ReshapeWeights", { false })),
+                                                                                                          make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() // BATCHED_MATMUL
+
+TEST_SUITE_END() // FP16
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(),
-                                                                                                          framework::dataset::make("ReshapeWeights", { true, false })),
-
-                                                                                                  framework::dataset::make("DataType", DataType::F32)))
+                                                                                                          make("ReshapeWeights", { true, false })),
+                                                                                                  make("DataType", DataType::F32)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeGEMMDataset(),
-                                                                                                        framework::dataset::make("ReshapeWeights", { true, false })),
+                                                                                                        make("ReshapeWeights", { true, false })),
+                                                                                                make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f);
+}
 
-                                                                                                framework::dataset::make("DataType", DataType::F32)))
+TEST_SUITE(BATCHED_MATMUL)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchedMatMulFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallBatchedMatMulDataset(),
+                                                                                                                   make("ReshapeWeights", { false })),
+                                                                                                           make("DataType", DataType::F32)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f);
 }
-TEST_SUITE(DisabledC)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixtureDisabledC<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(),
-                                                                                                                   framework::dataset::make("ReshapeWeights", { true, false })),
+TEST_SUITE_END() // BATCHED_MATMUL
 
-                                                                                                           framework::dataset::make("DataType", DataType::F32)))
+TEST_SUITE(ACCUMULATE)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMAccumulateFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallAccumulateGEMMDataset(),
+                                                                                                        make("ReshapeWeights", { false }),
+                                                                                                        make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMAccumulateFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeAccumulateGEMMDataset(),
+                                                                                                        make("ReshapeWeights", { false }),
+                                                                                                        make("DataType", DataType::F32)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() // ACCUMULATE
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // FP32
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // GEMM
+TEST_SUITE_END() // NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index 04282c2c3c..d25f43a330 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,8 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/NEON/Helper.h"
 #include "tests/PaddingCalculator.h"
@@ -37,7 +39,6 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/GEMMLowpAssemblyFixture.h"
 #include "tests/validation/fixtures/GEMMLowpFixture.h"
 
 namespace arm_compute
@@ -46,15 +47,26 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
+
 namespace
 {
-const auto data_matrix_multiply = framework::dataset::make("M", 12, 20) * framework::dataset::make("N", 12, 20) * framework::dataset::make("K", 16);
+    constexpr AbsoluteTolerance<float> tolerance_batched(1);
+    constexpr AbsoluteTolerance<float> tolerance_quant(1);
 } // namespace
 
+
 TEST_SUITE(NEON)
 TEST_SUITE(GEMMLowp)
 TEST_SUITE(MatrixMultiplyCore)
+
 using NEGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+using NEGEMMLowpMatrixMultiplyCoreAccumulateFixture = GEMMLowpMatrixMultiplyAccumulateValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+using NEGEMMLowpBatchedMatMulFixture      = GEMMLowpMatrixMultiplyCoreValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, true>;
+using NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture = GEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+using NEGEMMLowpDequantizedMatrixMultiplyValidationFixture = GEMMLowpDequantizedMatrixMultiplyValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+
+using framework::dataset::make;
 
 DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()),
                shape_a, shape_b, shape_c, a_offset, b_offset)
@@ -80,29 +92,69 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::c
     validate(b.info()->padding(), PaddingSize());
     validate(c.info()->padding(), PaddingSize());
 }
+// accumulation is not supported for Int8/UInt8 in aarch32
+#ifdef __aarch64__
+DATA_TEST_CASE(ValidateAccumulate, framework::DatasetMode::ALL, combine(
+                                                                    zip(
+                                                                     make("In0",{ TensorShape(21U, 1U) }),
+                                                                     make("In1", { TensorShape(1U, 21U) }),
+                                                                     make("Dst", { TensorShape(1U, 1U) }),
+                                                                     make("a_offset", { -2 }),
+                                                                     make("a_offset", { 13 })
+                                                                    ),
+                                                                    zip(
+                                                                     make("OutputDataType", {  DataType::S32,  DataType::QASYMM8, DataType::QASYMM8_SIGNED}),
+                                                                     make("Expected", { true, false, false })
+                                                                    )),
+               shape_a, shape_b, shape_dst, a_offset, b_offset, output_data_type, expected)
+{
+    DataType input_data_type = (output_data_type == DataType::S32 ? DataType::QASYMM8 : output_data_type);
+    // Accumulation test for GEMM kernels
+    TensorInfo a(shape_a, 1, input_data_type, QuantizationInfo(1.0f / 255, a_offset));
+    TensorInfo b(shape_b, 1, input_data_type, QuantizationInfo(1.0f / 255, b_offset));
+    TensorInfo dst(shape_dst, 1, output_data_type, QuantizationInfo());
+
+    // Create and configure function
+    GEMMInfo gemm_info = GEMMInfo();
+    gemm_info.set_accumulate(true);
+
+    if (is_data_type_quantized(output_data_type))
+    {
+        GEMMLowpOutputStageInfo gemmLowpOutputStageInfo = GEMMLowpOutputStageInfo();
+        gemmLowpOutputStageInfo.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+
+        gemm_info.set_gemmlowp_output_stage(gemmLowpOutputStageInfo);
+    }
+
+    cpu::CpuGemmLowpMatrixMultiplyCore gemmlowp_mm;
+    Status status = gemmlowp_mm.validate(&a, &b, nullptr, &dst, gemm_info);
+
+    ARM_COMPUTE_EXPECT((expected ==  bool(status)), framework::LogLevel::ERRORS);
+}
+#endif // __arch64__
 
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Input not a multiple of 4
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+    make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Input not a multiple of 4
                                              TensorInfo(TensorShape(21U, 13U), 1, DataType::S32),                                 // Mismatching data type
                                              TensorInfo(TensorShape(20U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions
                                              TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions
                                              TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)),
                                           }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+    make("InputBInfo",{ TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
                                             TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
                                             TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
                                             TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
                                             TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
+                                          }),
+    make("OutputInfo",{ TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
                                             TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
                                             TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
                                             TensorInfo(TensorShape(8U, 11U), 1, DataType::S32),
                                             TensorInfo(TensorShape(64U, 32U), 1, DataType::S32),
-                                           })),
-    framework::dataset::make("Expected", { true, false, false, false, true })),
+                                           }),
+    make("Expected", { true, false, false, false, true })),
     a_info, b_info, output_info, expected)
 {
     // Lock tensors
@@ -115,442 +167,222 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
+/** Test case for memory injection in @ref cpu::CpuGemmLowpMatrixMultiplyCore.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
 {
-    // Validate output
-    validate(Accessor(_target), _reference);
+    auto gemm     = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>();
+    auto a_info   = TensorInfo(TensorShape(32U, 72U), 1, DataType::QASYMM8);
+    auto b_info   = TensorInfo(TensorShape(17U, 32U), 1, DataType::QASYMM8);
+    auto dst_info = TensorInfo(TensorShape(17U, 72U), 1, DataType::S32);
+    a_info.set_quantization_info(QuantizationInfo(1.0f / 255, -9));
+    b_info.set_quantization_info(QuantizationInfo(1.0f / 255, 1));
+    const auto gemm_info = GEMMInfo{};
+    gemm->configure(&a_info, &b_info, nullptr, &dst_info, gemm_info);
+
+    // telhs are newly created every call of this lambda function
+    auto a   = create_tensor<Tensor>(a_info);
+    auto b   = create_tensor<Tensor>(b_info);
+    auto dst = create_tensor<Tensor>(dst_info);
+    a.allocator()->allocate();
+    b.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    ITensorPack run_pack =
+    {
+        { TensorType::ACL_SRC_0, &a },
+        { TensorType::ACL_SRC_1, &b },
+        { TensorType::ACL_DST, &dst }
+    };
+    ITensorPack prep_pack =
+    {
+        { TensorType::ACL_SRC_1, &b },
+    };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(gemm->workspace(), mg, run_pack, prep_pack);
+
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+        library->fill_tensor_value(Accessor(a), static_cast<uint8_t>(1));
+        library->fill_tensor_value(Accessor(b), static_cast<uint8_t>(2));
+        // This operator is configured once and captured by this lambda.
+        gemm->prepare(prep_pack);
+        gemm->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((uint8_t *)result_0.buffer())[i] == ((uint8_t *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
+/** Test case for memory injection in @ref NEGEMMLowpMatrixMultiplyCore.
+ *
+ * Make sure @ref NEGEMMLowpMatrixMultiplyCore still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
 {
-    // Validate output
-    validate(Accessor(_target), _reference);
+    auto gemm     = std::make_unique<NEGEMMLowpMatrixMultiplyCore>();
+    auto a_info   = TensorInfo(TensorShape(32U, 72U), 1, DataType::QASYMM8);
+    auto b_info   = TensorInfo(TensorShape(17U, 32U), 1, DataType::QASYMM8);
+    auto dst_info = TensorInfo(TensorShape(17U, 72U), 1, DataType::S32);
+    a_info.set_quantization_info(QuantizationInfo(1.0f / 255, -9));
+    b_info.set_quantization_info(QuantizationInfo(1.0f / 255, 1));
+    const auto gemm_info = GEMMInfo{};
+    auto       run_conv  = [&]()
+    {
+        auto a   = create_tensor<Tensor>(a_info);
+        auto b   = create_tensor<Tensor>(b_info);
+        auto dst = create_tensor<Tensor>(dst_info);
+        gemm->configure(&a, &b, nullptr, &dst, gemm_info);
+        a.allocator()->allocate();
+        b.allocator()->allocate();
+        dst.allocator()->allocate();
+        library->fill_tensor_value(Accessor(a), static_cast<uint8_t>(1));
+        library->fill_tensor_value(Accessor(b), static_cast<uint8_t>(2));
+        gemm->run();
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((uint8_t *)result_0.buffer())[i] == ((uint8_t *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
 }
 
-using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
-TEST_SUITE(FusedOffsetOutput)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(),
-                       framework::dataset::make("DataType", { DataType::QASYMM8 })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(),
-                       framework::dataset::make("DataType", { DataType::QASYMM8 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END() // FusedOffsetOutput
-TEST_SUITE_END() // MatrixMultiplyCore
-
-TEST_SUITE(OutputStage)
-
-TEST_SUITE(QuantizeDownInt32Scale)
 
+TEST_SUITE(BatchedMatMul)
 TEST_SUITE(QASYMM8)
-
-const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2,
-                                                      3)
-                                                      * framework::dataset::make("min", 0) * framework::dataset::make("max", 255) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1,
-                                                           2)
-                                                           * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true });
-
-using NEGEMMLowpQuantizeDownInt32ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture<Tensor, Accessor, NEGEMMLowpOutputStage>;
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Input not a multiple of 16
-                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type
-                                          }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(20U), 1, DataType::S32),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8),
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                           })),
-    framework::dataset::make("Min",{        0,
-                                            13,
-                                           })),
-    framework::dataset::make("Max",{        205,
-                                            180,
-                                           })),
-    framework::dataset::make("Expected", { true, false })),
-    a_info, b_info, output_info, min, max, expected)
-{
-
-    GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo();
-    output_stage.type        = GEMMLowpOutputStageType::QUANTIZE_DOWN;
-    output_stage.gemmlowp_min_bound        = min;
-    output_stage.gemmlowp_max_bound        = max;
-    output_stage.output_data_type = DataType::QASYMM8;
-
-    // Lock tensors
-    Status status =  NEGEMMLowpOutputStage::validate(&a_info.clone()->set_is_resizable(false),
-                                                                     &b_info.clone()->set_is_resizable(false),
-                                                                     &output_info.clone()->set_is_resizable(false),
-                                                                     output_stage);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
-
-TEST_CASE(NoPaddingAdded, framework::DatasetMode::PRECOMMIT)
-{
-    Tensor input1 = create_tensor<Tensor>(TensorShape(21U, 13U), DataType::S32);
-    Tensor input2 = create_tensor<Tensor>(TensorShape(21U, 1U), DataType::S32);
-    Tensor output = create_tensor<Tensor>(TensorShape(21U, 13U), DataType::QASYMM8);
-
-    GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo();
-    output_stage.type                    = GEMMLowpOutputStageType::QUANTIZE_DOWN;
-    output_stage.gemmlowp_min_bound      = 0;
-    output_stage.gemmlowp_max_bound      = 205;
-    output_stage.output_data_type        = DataType::QASYMM8;
-
-    NEGEMMLowpOutputStage f;
-    f.configure(&input1, &input2, &output, output_stage);
-
-    // Validate padding is zero
-    validate(input1.info()->padding(), PaddingSize());
-    validate(input2.info()->padding(), PaddingSize());
-    validate(output.info()->padding(), PaddingSize());
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
+using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned =
+    GEMMLowpBatchedMatrixMultiplyCoreFusedOffsetOutputFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, uint8_t, uint8_t, true>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned, framework::DatasetMode::ALL,
+    combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(),
+        make("DataType", { DataType::QASYMM8 }),
+        make("reshape_b_only_on_first_run", { false })))
 {
-    // Validate output
-    validate(Accessor(_target), _reference);
+    validate(Accessor(_target), _reference, tolerance_batched);
 }
-
-TEST_SUITE_END() // BoundedReLu
-
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
-
-const auto quantize_down_int32_to_int8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2,
-                                                     3)
-                                                     * framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1,
-                                                          2)
-                                                          * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", -100, -98) * framework::dataset::make("max", 71, 74) * framework::dataset::make("addBias", { false, true });
-
-using NEGEMMLowpQuantizeDownInt32ScaleFixture = GEMMLowpQuantizeDownInt32ToInt8ScaleValidationFixture<Tensor, Accessor, NEGEMMLowpOutputStage>;
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Input not a multiple of 16
-                                             TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Invalid min and max
-                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type
-                                          }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(20U), 1, DataType::S32),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED),
-                                            TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED),
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                           })),
-    framework::dataset::make("Min",{        -10,
-                                            -200,
-                                            -113,
-                                           })),
-    framework::dataset::make("Max",{        105,
-                                            300,
-                                            -18,
-                                           })),
-    framework::dataset::make("Expected", { true, false, false })),
-    a_info, b_info, output_info, min, max, expected)
+using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned =
+    GEMMLowpBatchedMatrixMultiplyCoreFusedOffsetOutputFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, int8_t, int8_t, true>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned, framework::DatasetMode::ALL,
+    combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(),
+        make("DataType", { DataType::QASYMM8_SIGNED }),
+        make("reshape_b_only_on_first_run", { false })))
 {
-    GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo();
-    output_stage.type        = GEMMLowpOutputStageType::QUANTIZE_DOWN;
-    output_stage.gemmlowp_min_bound        = min;
-    output_stage.gemmlowp_max_bound        = max;
-    output_stage.output_data_type = DataType::QASYMM8_SIGNED;
-
-    // Lock tensors
-    Status status =  NEGEMMLowpOutputStage::validate(&a_info.clone()->set_is_resizable(false),
-                                                                     &b_info.clone()->set_is_resizable(false),
-                                                                     &output_info.clone()->set_is_resizable(false),
-                                                                     output_stage);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+    validate(Accessor(_target), _reference, tolerance_batched);
 }
-// clang-format on
-// *INDENT-ON*
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // BatchedMatMul
 
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_cases))
+TEST_SUITE(FusedOffsetOutput)
+using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL,
+    combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(),
+        make("DataType", { DataType::QASYMM8 }),
+        make("reshape_b_only_on_first_run", { false })))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
+    validate(Accessor(_target), _reference, tolerance_quant);
 }
-
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_relu_cases))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(),
+        make("DataType", { DataType::QASYMM8 }),
+        make("reshape_b_only_on_first_run", { false })))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-TEST_SUITE_END() // BoundedReLu
-
-TEST_SUITE_END() // QASYMM8_SIGNED
-
-TEST_SUITE_END() // QuantizeDownInt32Scale
-
-TEST_SUITE(QuantizeDownInt32ToUint8ScaleByFixedPoint)
-
-const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                    2)
-                                                                    * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0) * framework::dataset::make("max", 255) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                         2)
-                                                                         * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true });
-
-using NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint>;
-
-using NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint>;
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Input not a multiple of 16
-                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type
-                                          }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(20U), 1, DataType::S32),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8),
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                           })),
-    framework::dataset::make("Min",{        0,
-                                            13,
-                                           })),
-    framework::dataset::make("Max",{        205,
-                                            180,
-                                           })),
-    framework::dataset::make("Expected", { true, false })),
-    a_info, b_info, output_info, min, max, expected)
-{
-    // Lock tensors
-    Status status =  NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&a_info.clone()->set_is_resizable(false),
-                                                                                 &b_info.clone()->set_is_resizable(false),
-                                                                                 &output_info.clone()->set_is_resizable(false),
-                                                                                 min,
-                                                                                 max);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+    validate(Accessor(_target), _reference, tolerance_quant);
 }
-// clang-format on
-// *INDENT-ON*
+TEST_SUITE_END() // FusedOffsetOutput
 
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_cases))
+// accumulation is not supported for Int8/UInt8 in aarch32
+#ifdef __aarch64__
+TEST_SUITE(ACCUMULATION)
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreAccumulateFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_cases))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreAccumulateFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
+TEST_SUITE_END() // S32
+TEST_SUITE_END() // ACCUMULATION
+#endif // __arch64__
 
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases))
+TEST_SUITE(DynamicQuantization)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END() // BoundedReLu
-
-TEST_SUITE_END() // QuantizeDownInt32ToUint8ScaleByFixedPoint
-
-TEST_SUITE(QuantizeDownInt32ToInt8ScaleByFixedPoint)
-
-const auto quantize_down_int32_to_int8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                   2)
-                                                                   * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -128) * framework::dataset::make("max", 128) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                        2)
-                                                                        * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
-
-using NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint>;
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-        framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::F32), // Invalid input data type
-                                                 TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type
-                                                 TensorInfo(TensorShape(21U, 13U), 1, DataType::S32),
-        }),
-        framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                                TensorInfo(TensorShape(20U), 1, DataType::S32),
-                                                TensorInfo(TensorShape(21U), 1, DataType::S32),
-        })),
-        framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED),
-                                                TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                                TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED),
-        })),
-        framework::dataset::make("Min",{ -110,
-                                         -113,
-                                         -113,
-        })),
-        framework::dataset::make("Max",{ 87,
-                                         97,
-                                         97,
-        })),
-        framework::dataset::make("Expected", { false, false, true })),
-               a_info, b_info, output_info, min, max, expected)
-{
-    // Lock tensors
-    Status status =  NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(&a_info.clone()->set_is_resizable(false),
-                                                                                  &b_info.clone()->set_is_resizable(false),
-                                                                                  &output_info.clone()->set_is_resizable(false),
-                                                                                  min,
-                                                                                  max);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int8_scale_by_fixedpoint_cases))
+TEST_SUITE_END() // DynamicQuantization
+
+#ifdef __aarch64__
+// Deqaunt tests involve returning F32 from the MatrixMultiplyCore kernels and is only implemented in aarch64
+TEST_SUITE(Dequant)
+constexpr AbsoluteTolerance<float> tolerance_dequantized(0.01f);
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::ALL,
+    combine(
+        datasets::SmallGEMMLowpDataset(),
+        make("accumulate", {true, false})
+    ))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
+    validate(Accessor(_target), _reference, tolerance_dequantized);
 }
 
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::NIGHTLY,
+    combine(
+        datasets::LargeGEMMLowpDataset(),
+        make("accumulate", {false})
+    ))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // BoundedReLu
-TEST_SUITE_END() // QuantizeDownInt32ToInt8ScaleByFixedPoint
-
-TEST_SUITE(QuantizeDownInt32ToInt16ScaleByFixedPoint)
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                    2)
-                                                                    * framework::dataset::make("min", -32768) * framework::dataset::make("max", 32767) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                         2)
-                                                                         * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases = framework::dataset::make("result_fixedpoint_multiplier", 1073741823,
-                                                                                                        1073741825)
-                                                                               * framework::dataset::make("result_shift", -3,
-                                                                                                          -2)
-                                                                               * framework::dataset::make("min", -32768) * framework::dataset::make("max", 32767) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600,
-                                                                                                             254601602)
-                                                                                    * framework::dataset::make("result_shift", -3,
-                                                                                                               -1)
-                                                                                    * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
-
-using NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint>;
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Input not a multiple of 16
-                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type
-                                          }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(20U), 1, DataType::S32),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QSYMM16),
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                           })),
-    framework::dataset::make("Min",{        -205,
-                                            -180,
-                                           })),
-    framework::dataset::make("Max",{        205,
-                                            180,
-                                           })),
-    framework::dataset::make("Expected", { true, false })),
-    a_info, b_info, output_info, min, max, expected)
-{
-    // Lock tensors
-    Status status =  NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&a_info.clone()->set_is_resizable(false),
-                                                                                 &b_info.clone()->set_is_resizable(false),
-                                                                                 &output_info.clone()->set_is_resizable(false),
-                                                                                 min,
-                                                                                 max);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+    validate(Accessor(_target), _reference, tolerance_dequantized);
 }
-// clang-format on
-// *INDENT-ON*
+TEST_SUITE_END() // Dequant
+#endif // __aarch64__
 
-TEST_SUITE(NoRelu)
-TEST_SUITE(MultSmallerEq1)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_cases))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // MultSmallerEq1
-TEST_SUITE(MultGreater1)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // MultGreater1
-TEST_SUITE_END() // NoRelu
-TEST_SUITE(BoundedReLu)
-TEST_SUITE(MultSmallerEq1)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_relu_cases))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // MultSmallerEq1
-TEST_SUITE(MultGreater1)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // MultGreater1
-TEST_SUITE_END() // BoundedReLu
-TEST_SUITE_END() // QuantizeDownInt32ToInt16ScaleByFixedPoint
-TEST_SUITE_END() // OutputStage
+TEST_SUITE_END() // MatrixMultiplyCore
 TEST_SUITE_END() // GEMMLowp
 TEST_SUITE_END() // NEON
 } // namespace validation
diff --git a/tests/validation/NEON/Gather.cpp b/tests/validation/NEON/Gather.cpp
index d2d5df802a..0aea19939e 100644
--- a/tests/validation/NEON/Gather.cpp
+++ b/tests/validation/NEON/Gather.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -100,12 +100,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 template <typename T>
 using NEGatherFixture = GatherFixture<Tensor, Accessor, NEGather, T>;
 
+const auto gather_small_shapes = arm_compute::test::framework::dataset::concat(datasets::SmallGatherDataset(), datasets::SmallGatherMultiDimIndicesDataset());
+
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEGatherFixture<half>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(datasets::SmallGatherDataset(), framework::dataset::make("DataType", DataType::F16)))
+                       combine(gather_small_shapes, framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -125,7 +127,7 @@ TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEGatherFixture<float>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(datasets::SmallGatherDataset(), framework::dataset::make("DataType", DataType::F32)))
+                       combine(gather_small_shapes, framework::dataset::make("DataType", DataType::F32)))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -146,7 +148,7 @@ TEST_SUITE(U8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEGatherFixture<uint8_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(datasets::SmallGatherDataset(), framework::dataset::make("DataType", DataType::U8)))
+                       combine(gather_small_shapes, framework::dataset::make("DataType", DataType::U8)))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -166,7 +168,7 @@ TEST_SUITE(U16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEGatherFixture<uint16_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(datasets::SmallGatherDataset(), framework::dataset::make("DataType", DataType::U16)))
+                       combine(gather_small_shapes, framework::dataset::make("DataType", DataType::U16)))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -183,7 +185,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
 TEST_SUITE_END() // U16
 
 TEST_SUITE_END() // Gather
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Gaussian3x3.cpp b/tests/validation/NEON/Gaussian3x3.cpp
deleted file mode 100644
index efc555e279..0000000000
--- a/tests/validation/NEON/Gaussian3x3.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/Gaussian3x3Fixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /** Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /** Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Gaussian3x3)
-
-template <typename T>
-using NEGaussian3x3Fixture = Gaussian3x3ValidationFixture<Tensor, Accessor, NEGaussian3x3, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGaussian3x3Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::U8)),
-                                                                                                         datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGaussian3x3Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::U8)),
-                                                                                                         datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Gaussian5x5.cpp b/tests/validation/NEON/Gaussian5x5.cpp
deleted file mode 100644
index 6352333f12..0000000000
--- a/tests/validation/NEON/Gaussian5x5.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/Gaussian5x5Fixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 5;              /** Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /** Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Gaussian5x5)
-
-template <typename T>
-using NEGaussian5x5Fixture = Gaussian5x5ValidationFixture<Tensor, Accessor, NEGaussian5x5, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGaussian5x5Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::U8)),
-                                                                                                         datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGaussian5x5Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::U8)),
-                                                                                                         datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/GaussianPyramid.cpp b/tests/validation/NEON/GaussianPyramid.cpp
deleted file mode 100644
index 9639cbfd91..0000000000
--- a/tests/validation/NEON/GaussianPyramid.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/GaussianPyramidHalfFixture.h"
-#include "tests/validation/reference/Utils.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-const auto small_gaussian_pyramid_levels = combine(datasets::Medium2DShapes(), datasets::BorderModes()) * framework::dataset::make("numlevels", 2, 4);
-const auto large_gaussian_pyramid_levels = combine(datasets::Large2DShapes(), datasets::BorderModes()) * framework::dataset::make("numlevels", 2, 5);
-
-template <typename T>
-inline void validate_gaussian_pyramid(const Pyramid &target, const std::vector<SimpleTensor<T>> &reference, BorderMode border_mode)
-{
-    ValidRegion prev_valid_region = shape_to_valid_region(reference[0].shape());
-
-    for(size_t i = 1; i < reference.size(); ++i)
-    {
-        const ValidRegion valid_region = shape_to_valid_region_gaussian_pyramid_half(reference[i - 1].shape(), prev_valid_region, (border_mode == BorderMode::UNDEFINED));
-
-        // Validate outputs
-        validate(Accessor(*(target.get_pyramid_level(i))), reference[i], valid_region);
-
-        // Keep the valid region for the next level
-        prev_valid_region = valid_region;
-    }
-}
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(GaussianPyramid)
-TEST_SUITE(Half)
-
-template <typename T>
-using NEGaussianPyramidHalfFixture = GaussianPyramidHalfValidationFixture<Tensor, Accessor, NEGaussianPyramidHalf, T, Pyramid>;
-
-FIXTURE_DATA_TEST_CASE(RunSmallGaussianPyramidHalf, NEGaussianPyramidHalfFixture<uint8_t>, framework::DatasetMode::NIGHTLY, small_gaussian_pyramid_levels)
-{
-    validate_gaussian_pyramid(_target, _reference, _border_mode);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLargeGaussianPyramidHalf, NEGaussianPyramidHalfFixture<uint8_t>, framework::DatasetMode::NIGHTLY, large_gaussian_pyramid_levels)
-{
-    validate_gaussian_pyramid(_target, _reference, _border_mode);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/GenerateProposalsLayer.cpp b/tests/validation/NEON/GenerateProposalsLayer.cpp
index d14b8cb8c4..960c2054e7 100644
--- a/tests/validation/NEON/GenerateProposalsLayer.cpp
+++ b/tests/validation/NEON/GenerateProposalsLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,13 +22,14 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h"
 #include "arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPermute.h"
 #include "arm_compute/runtime/NEON/functions/NESlice.h"
+#include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
 #include "tests/Globals.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/NEON/ArrayAccessor.h"
+#include "tests/NEON/Helper.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
@@ -43,6 +44,8 @@ namespace validation
 {
 namespace
 {
+using NEComputeAllAnchors = NESynthetizeFunction<NEComputeAllAnchorsKernel>;
+
 template <typename U, typename T>
 inline void fill_tensor(U &&tensor, const std::vector<T> &v)
 {
@@ -412,7 +415,7 @@ TEST_SUITE_END() // QASYMM8
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // GenerateProposals
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/HOGDescriptor.cpp b/tests/validation/NEON/HOGDescriptor.cpp
deleted file mode 100644
index f6c7ed310b..0000000000
--- a/tests/validation/NEON/HOGDescriptor.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/HOG.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/HOGDescriptorDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/HOGDescriptorFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-RelativeTolerance<float> tolerance(0.1f);
-constexpr float          tolerance_number = 0.05f;
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(HOGDescriptor)
-
-// *INDENT-OFF*
-// clang-format off
-using NEHOGDescriptorFixture = HOGDescriptorValidationFixture<Tensor, HOG, Accessor, NEHOGDescriptor, uint8_t, float>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEHOGDescriptorFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(
-                       datasets::SmallHOGDescriptorDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance, tolerance_number);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEHOGDescriptorFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(
-                       datasets::LargeHOGDescriptorDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance, tolerance_number);
-}
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/HOGDetector.cpp b/tests/validation/NEON/HOGDetector.cpp
deleted file mode 100644
index dd138d75e3..0000000000
--- a/tests/validation/NEON/HOGDetector.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/Array.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/ArrayAccessor.h"
-#include "tests/NEON/HOGAccessor.h"
-#include "tests/datasets/HOGDescriptorDataset.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/HOGDetectorFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/* Set the tolerance (percentage) used when validating the score of detection window.
-   Note: High tolerance is required due to divergence between CL and NEON detection window scores. */
-RelativeTolerance<float> tolerance(1.0f);
-
-/* Input dataset (values must be a multiple of the HOGInfo block_size) */
-const auto DetectionWindowStrideDataset = framework::dataset::make("DetectionWindowStride", { Size2D(8, 8), Size2D(16, 16) });
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(HOGDetector)
-
-// *INDENT-OFF*
-// clang-format off
-using NEHOGDetectorFixture = HOGDetectorValidationFixture<Tensor,
-                                                          HOG,
-                                                          DetectionWindowArray,
-                                                          NEHOGDescriptor,
-                                                          Accessor,
-                                                          ArrayAccessor<DetectionWindow>,
-                                                          HOGAccessor,
-                                                          NEHOGDetector,
-                                                          uint8_t,
-                                                          float>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEHOGDetectorFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       DetectionWindowStrideDataset,
-                       datasets::SmallHOGDescriptorDataset()),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
-{
-    // Validate output
-    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEHOGDetectorFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       DetectionWindowStrideDataset,
-                       datasets::LargeHOGDescriptorDataset()),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
-{
-    // Validate output
-    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
-}
-
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/HOGMultiDetection.cpp b/tests/validation/NEON/HOGMultiDetection.cpp
deleted file mode 100644
index ba51e371ae..0000000000
--- a/tests/validation/NEON/HOGMultiDetection.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/MultiHOG.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/ArrayAccessor.h"
-#include "tests/NEON/HOGAccessor.h"
-#include "tests/datasets/HOGMultiDetectionDataset.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/HOGMultiDetectionFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/* Set the tolerance (percentage) used when validating the strength of detection window. */
-RelativeTolerance<float> tolerance(1.0f);
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(HOGMultiDetection)
-
-// *INDENT-OFF*
-// clang-format off
-using NEHOGMultiDetectionFixture = HOGMultiDetectionValidationFixture<Tensor,
-                                                                      HOG,
-                                                                      MultiHOG,
-                                                                      DetectionWindowArray,
-                                                                      Size2DArray,
-                                                                      Accessor,
-                                                                      ArrayAccessor<Size2D>,
-                                                                      ArrayAccessor<DetectionWindow>,
-                                                                      HOGAccessor,
-                                                                      NEHOGMultiDetection,
-                                                                      uint8_t,
-                                                                      float>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEHOGMultiDetectionFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       datasets::SmallHOGMultiDetectionDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})),
-                       framework::dataset::make("NonMaximaSuppression", {false, true})))
-{
-    // Validate output
-    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEHOGMultiDetectionFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       datasets::LargeHOGMultiDetectionDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})),
-                       framework::dataset::make("NonMaximaSuppression", {false, true})))
-{
-    // Validate output
-    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
-}
-
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/HarrisCorners.cpp b/tests/validation/NEON/HarrisCorners.cpp
deleted file mode 100644
index 4c05c777b6..0000000000
--- a/tests/validation/NEON/HarrisCorners.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/ArrayAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ImageFileDatasets.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/HarrisCornersFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/* Allowed percentage of keypoints missing for target */
-const float allowed_missing_percentage = 10.f;
-/* Allowed percentage of keypoints mismatching between target and reference */
-const float allowed_mismatch_percentage = 10.f;
-
-const auto data = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }), combine(framework::dataset::make("BlockSize", { 3, 5, 7 }), datasets::BorderModes()));
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(HarrisCorners)
-
-template <typename T>
-using NEHarrisCornersFixture = HarrisCornersValidationFixture<Tensor, Accessor, KeyPointArray, NEHarrisCorners, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEHarrisCornersFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallImageFiles(), data), framework::dataset::make("Format",
-                                                                                                           Format::U8)))
-{
-    // Validate output
-    ArrayAccessor<KeyPoint> array(_target);
-    validate_keypoints(array.buffer(),
-                       array.buffer() + array.num_values(),
-                       _reference.begin(),
-                       _reference.end(),
-                       RelativeTolerance<float>(0.0001f),
-                       allowed_missing_percentage,
-                       allowed_mismatch_percentage);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEHarrisCornersFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeImageFiles(), data), framework::dataset::make("Format", Format::U8)))
-{
-    // Validate output
-    ArrayAccessor<KeyPoint> array(_target);
-    validate_keypoints(array.buffer(),
-                       array.buffer() + array.num_values(),
-                       _reference.begin(),
-                       _reference.end(),
-                       RelativeTolerance<float>(0.0001f),
-                       allowed_missing_percentage,
-                       allowed_mismatch_percentage);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Histogram.cpp b/tests/validation/NEON/Histogram.cpp
deleted file mode 100644
index 906880251b..0000000000
--- a/tests/validation/NEON/Histogram.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/Distribution1D.h"
-#include "arm_compute/runtime/NEON/functions/NEHistogram.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/HistogramFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(Histogram)
-
-template <typename T>
-using NEHistogramFixture = HistogramValidationFixture<Tensor, Accessor, NEHistogram, T, Distribution1D>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEHistogramFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                       DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEHistogramFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                       DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Im2Col.cpp b/tests/validation/NEON/Im2Col.cpp
index d218956884..ef5e75c5db 100644
--- a/tests/validation/NEON/Im2Col.cpp
+++ b/tests/validation/NEON/Im2Col.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,8 +22,9 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEIm2Col.h"
+#include "src/cpu/kernels/CpuIm2ColKernel.h"
 #include "tests/NEON/Accessor.h"
+#include "tests/NEON/Helper.h"
 #include "tests/datasets/ShapeDatasets.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
@@ -39,6 +40,8 @@ namespace validation
 {
 namespace
 {
+const auto im2col_shapes = framework::dataset::make("Shape", { TensorShape{ 11U, 11U, 11U }, TensorShape{ 16U, 16U, 16U }, TensorShape{ 27U, 13U, 7U }, TensorShape{ 31U, 27U, 17U, 2U }, TensorShape{ 27U, 13U, 5U, 4U }, TensorShape{ 11U, 11U, 5U, 5U } });
+
 const auto conv_filter_sizes = framework::dataset::make("KernelDims", { Size2D(3U, 3U), Size2D(3U, 1U), Size2D(1U, 5U), Size2D(5U, 5U), Size2D(7U, 7U) });
 const auto conv_args         = combine(combine(combine(combine(conv_filter_sizes, framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(2U, 2U, 0U, 2U) })),
                                                        framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
@@ -54,6 +57,8 @@ const auto conv_args_small         = combine(combine(combine(combine(conv_filter
 TEST_SUITE(NEON)
 TEST_SUITE(Im2Col)
 
+using CpuIm2Col = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuIm2ColKernel>;
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
@@ -73,26 +78,26 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                framework::dataset::make("Expected", { false, false, false, false, true })),
                input_info, output_info, has_bias, expected)
 {
-    bool status = bool(NEIm2Col::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias));
+    bool status = bool(cpu::kernels::CpuIm2ColKernel::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias));
     ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
 // *INDENT-ON*
 
 template <typename T>
-using NEIm2ColFixture = Im2ColValidationFixture<Tensor, Accessor, NEIm2Col, T, false>;
+using CpuIm2ColFixture = Im2ColOpValidationFixture<Tensor, Accessor, CpuIm2Col, T, false>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)),
-                                                                                                    conv_args_small))
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuIm2ColFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::F32)),
+                                                                                                     conv_args_small))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType",
-                                                                                                          DataType::F32)),
-                                                                                                  conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, CpuIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()), framework::dataset::make("DataType",
+                                                                                                           DataType::F32)),
+                                                                                                   conv_args))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -102,15 +107,15 @@ TEST_SUITE_END() // FP32
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
-                                                                                                   conv_args_small))
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuIm2ColFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::F16)),
+                                                                                                    conv_args_small))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType",
-                                                                                                         DataType::F16)),
-                                                                                                 conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, CpuIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()), framework::dataset::make("DataType",
+                                                                                                          DataType::F16)),
+                                                                                                  conv_args))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -122,15 +127,15 @@ TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                      conv_args_small))
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuIm2ColFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                       conv_args_small))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                            framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                    conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, CpuIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()),
+                                                                                                             framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                     conv_args))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -160,8 +165,8 @@ TEST_CASE(PaddedChannelNHWC, framework::DatasetMode::PRECOMMIT)
     Tensor dst_target = create_tensor<Tensor>(dst_shape, data_type, 1, qinfo);
 
     // Configure target function
-    NEIm2Col im2col_func;
-    im2col_func.configure(&src_target, &dst_target, spatial_kernel, conv_info, has_bias);
+    CpuIm2Col im2col_func;
+    im2col_func.configure(src_target.info(), dst_target.info(), spatial_kernel, conv_info, has_bias);
 
     // Extend padding
     src_target.info()->extend_padding(PaddingSize(3, 5, 9, 1));
@@ -180,8 +185,13 @@ TEST_CASE(PaddedChannelNHWC, framework::DatasetMode::PRECOMMIT)
     // Fill target source
     library->fill_tensor_uniform(Accessor(src_target), 0);
 
+    ITensorPack pack =
+    {
+        { TensorType::ACL_SRC, &src_target },
+        { TensorType::ACL_DST, &dst_target }
+    };
     // Run target function
-    im2col_func.run();
+    im2col_func.run(pack);
 
     // Calculate Reference
     SimpleTensor<float> src_ref{ src_shape, data_type, 1, qinfo, data_layout };
@@ -200,7 +210,7 @@ TEST_CASE(PaddedChannelNHWC, framework::DatasetMode::PRECOMMIT)
 }
 TEST_SUITE_END() // Special Cases
 TEST_SUITE_END() // Im2Col
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/InstanceNormalizationLayer.cpp b/tests/validation/NEON/InstanceNormalizationLayer.cpp
index 1073a7f6f7..593ef0ad99 100644
--- a/tests/validation/NEON/InstanceNormalizationLayer.cpp
+++ b/tests/validation/NEON/InstanceNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -123,7 +123,7 @@ TEST_SUITE_END() // FP16
 #endif           // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 TEST_SUITE_END() // InstanceNormalizationLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/IntegralImage.cpp b/tests/validation/NEON/IntegralImage.cpp
deleted file mode 100644
index a29f7423c1..0000000000
--- a/tests/validation/NEON/IntegralImage.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/IntegralImageFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(IntegralImage)
-
-template <typename T>
-using NEIntegralImageFixture = IntegralImageValidationFixture<Tensor, Accessor, NEIntegralImage, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIntegralImageFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                           DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIntegralImageFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                           DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/L2NormalizeLayer.cpp b/tests/validation/NEON/L2NormalizeLayer.cpp
index 82e4beb05a..56223c44d5 100644
--- a/tests/validation/NEON/L2NormalizeLayer.cpp
+++ b/tests/validation/NEON/L2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -139,7 +139,7 @@ TEST_SUITE_END() // FP16
 #endif           // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 TEST_SUITE_END() // L2NormalizeLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/LSTMLayer.cpp b/tests/validation/NEON/LSTMLayer.cpp
index 0850dc6954..c4645563bf 100644
--- a/tests/validation/NEON/LSTMLayer.cpp
+++ b/tests/validation/NEON/LSTMLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -181,7 +181,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NELSTMLayerFixture<half>, framework::DatasetMod
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 TEST_SUITE_END() // LSTMLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/LSTMLayerQuantized.cpp b/tests/validation/NEON/LSTMLayerQuantized.cpp
index fbcece8ffc..6b98ee2b67 100644
--- a/tests/validation/NEON/LSTMLayerQuantized.cpp
+++ b/tests/validation/NEON/LSTMLayerQuantized.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -64,11 +64,7 @@ inline void fill_tensor(SimpleTensor<T> &tensor, const std::vector<T> &v)
 }
 
 /** Tolerance for quantized asymmetric operations */
-#if defined(__aarch64__)
-constexpr AbsoluteTolerance<int16_t> tolerance_qsymm16(0);
-#else  // defined(__aarch64__)
 constexpr AbsoluteTolerance<int16_t> tolerance_qsymm16(1);
-#endif // defined(__aarch64__)
 
 } // namespace
 
@@ -602,7 +598,7 @@ TEST_SUITE_END() // IntegrationTestCase
 // *INDENT-ON*
 
 TEST_SUITE_END() // LSTMLayerQuantized
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/LaplacianPyramid.cpp b/tests/validation/NEON/LaplacianPyramid.cpp
deleted file mode 100644
index b5215e0fa1..0000000000
--- a/tests/validation/NEON/LaplacianPyramid.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/LaplacianPyramidFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-const auto small_laplacian_pyramid_levels = framework::dataset::make("NumLevels", 2, 3);
-const auto large_laplacian_pyramid_levels = framework::dataset::make("NumLevels", 2, 5);
-
-const auto formats = combine(framework::dataset::make("FormatIn", Format::U8), framework::dataset::make("FormatOut", Format::S16));
-
-template <typename T>
-inline void validate_laplacian_pyramid(const Pyramid &target, const std::vector<SimpleTensor<T>> &reference, BorderMode border_mode)
-{
-    Tensor     *level_image  = target.get_pyramid_level(0);
-    ValidRegion valid_region = shape_to_valid_region(reference[0].shape(), border_mode == BorderMode::UNDEFINED, BorderSize(2));
-
-    // Validate lowest level
-    validate(Accessor(*level_image), reference[0], valid_region);
-
-    // Validate remaining levels
-    for(size_t lev = 1; lev < target.info()->num_levels(); lev++)
-    {
-        level_image              = target.get_pyramid_level(lev);
-        Tensor *prev_level_image = target.get_pyramid_level(lev - 1);
-
-        valid_region = shape_to_valid_region_laplacian_pyramid(prev_level_image->info()->tensor_shape(),
-                                                               prev_level_image->info()->valid_region(),
-                                                               border_mode == BorderMode::UNDEFINED);
-
-        // Validate level
-        validate(Accessor(*level_image), reference[lev], valid_region);
-    }
-}
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(LaplacianPyramid)
-
-// *INDENT-OFF*
-// clang-format off
-
-using NELaplacianPyramidFixture = LaplacianPyramidValidationFixture<Tensor, Accessor, NELaplacianPyramid, uint8_t, int16_t, Pyramid>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NELaplacianPyramidFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       datasets::Medium2DShapes(),
-                       datasets::BorderModes()),
-                       small_laplacian_pyramid_levels),
-                       formats))
-{
-    validate_laplacian_pyramid(_target, _reference, _border_mode);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NELaplacianPyramidFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       datasets::Large2DShapes(),
-                       datasets::BorderModes()),
-                       large_laplacian_pyramid_levels),
-                       formats))
-{
-    validate_laplacian_pyramid(_target, _reference, _border_mode);
-}
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/LaplacianReconstruct.cpp b/tests/validation/NEON/LaplacianReconstruct.cpp
deleted file mode 100644
index b679d365b4..0000000000
--- a/tests/validation/NEON/LaplacianReconstruct.cpp
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h"
-#include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h"
-#include "arm_compute/runtime/Pyramid.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/LaplacianReconstructFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-const auto small_laplacian_reconstruct_levels = framework::dataset::make("NumLevels", 2, 3);
-const auto large_laplacian_reconstruct_levels = framework::dataset::make("NumLevels", 2, 5);
-
-const auto formats = combine(framework::dataset::make("FormatIn", Format::S16), framework::dataset::make("FormatOut", Format::U8));
-
-template <typename T>
-void validate_laplacian_reconstruct(Tensor &target, const SimpleTensor<T> &reference, BorderMode border_mode, size_t num_levels)
-{
-    const unsigned int filter_size = 5;
-    const unsigned int border_size(filter_size / 2);
-
-    BorderSize border(std::pow(border_size, num_levels));
-
-    // Validate output
-    ValidRegion valid_region = shape_to_valid_region(reference.shape(), border_mode == BorderMode::UNDEFINED, border);
-    validate(Accessor(target), reference, valid_region);
-}
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(LaplacianReconstruct)
-
-// *INDENT-OFF*
-// clang-format off
-
-using NELaplacianReconstructFixture = LaplacianReconstructValidationFixture<Tensor, Accessor, NELaplacianReconstruct, NELaplacianPyramid, int16_t, uint8_t, Pyramid>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NELaplacianReconstructFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       datasets::Medium2DShapes(),
-                       datasets::BorderModes()),
-                       small_laplacian_reconstruct_levels),
-                       formats))
-{
-    validate_laplacian_reconstruct(_target, _reference, _border_mode, _pyramid_levels);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NELaplacianReconstructFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       datasets::Large2DShapes(),
-                       datasets::BorderModes()),
-                       large_laplacian_reconstruct_levels),
-                       formats))
-{
-    validate_laplacian_reconstruct(_target, _reference, _border_mode, _pyramid_levels);
-}
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Logical.cpp b/tests/validation/NEON/Logical.cpp
index 6f1c55b33c..271d22f877 100644
--- a/tests/validation/NEON/Logical.cpp
+++ b/tests/validation/NEON/Logical.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -85,7 +85,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NELogicalNotFixture<uint8_t>, framework::Datase
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // LogicalNot
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
-\ No newline at end of file
+} // namespace arm_compute
diff --git a/tests/validation/NEON/Magnitude.cpp b/tests/validation/NEON/Magnitude.cpp
deleted file mode 100644
index 76429af211..0000000000
--- a/tests/validation/NEON/Magnitude.cpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEMagnitude.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/MagnitudeFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-template <typename T>
-AbsoluteTolerance<T> tolerance(MagnitudeType magnitude_type)
-{
-    return AbsoluteTolerance<T>((MagnitudeType::L1NORM == magnitude_type) ? 0 : 1);
-}
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Magnitude)
-
-template <typename T>
-using NEMagnitudeFixture = MagnitudeValidationFixture<Tensor, Accessor, NEMagnitude, T>;
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEMagnitudeFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::Small2DShapes(), framework::dataset::make("Format", Format::S16)),
-                                                                                                       framework::dataset::make("MagnitudeType", { MagnitudeType::L1NORM, MagnitudeType::L2NORM })))
-
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance<int16_t>(_magnitude_type));
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEMagnitudeFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::Large2DShapes(), framework::dataset::make("Format", Format::S16)),
-                                                                                                       framework::dataset::make("MagnitudeType", { MagnitudeType::L1NORM, MagnitudeType::L2NORM })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance<int16_t>(_magnitude_type));
-}
-TEST_SUITE_END() // S16
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/MatMul.cpp b/tests/validation/NEON/MatMul.cpp
new file mode 100644
index 0000000000..f22bd9e86a
--- /dev/null
+++ b/tests/validation/NEON/MatMul.cpp
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
+
+#include "tests/datasets/LargeMatMulDataset.h"
+#include "tests/datasets/SmallMatMulDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/validation/fixtures/MatMulFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using framework::dataset::make;
+
+TEST_SUITE(NEON)
+TEST_SUITE(MatMul)
+
+constexpr AbsoluteTolerance<float> tolerance_fp32(
+    0.001f); /**< Tolerance value for comparing reference's output against implementation's output for FP32 data types */
+const AbsoluteTolerance<half> tolerance_fp16(half(0.1f));
+#ifdef __aarch64__
+constexpr AbsoluteTolerance<int32_t> tolerance_qasymm8(1);
+constexpr AbsoluteTolerance<int32_t> tolerance_qasymm8_signed(1);
+#endif // __aarch64__
+
+// clang-format off
+// *INDENT-OFF*
+// Validation Tests
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL,
+    zip(
+        make("InputAInfo", {
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::F32),        // Mismatching datatype
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::S32),        // Unsupported datatypes
+            TensorInfo(TensorShape(9U, 6U, 2U), 1, DataType::F32),    // Broadcasting in batch dimension not supported
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::F32),        // Invalid shape for multiplication
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::F32),
+            TensorInfo(TensorShape(9U, 6U , 12U) , 1 , DataType::F32),
+            TensorInfo(TensorShape(9U, 6U , 12U) , 1 , DataType::F32), // Tensors are not dynamic
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8),
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8_SIGNED),
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8_SIGNED), // Mismatching data type
+        }),
+        make("InputBInfo", {
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::S32),
+            TensorInfo(TensorShape(5U, 9U, 1U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 12U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U, 12U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U, 12U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8_SIGNED),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8_SIGNED),
+        }),
+        make("OutputInfo", {
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::S32),
+            TensorInfo(TensorShape(5U, 6U, 2U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U, 12U) , 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U, 12U) , 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8_SIGNED),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8),
+        }),
+        make("TensorIsConst", {false, false, false, false, false , false, true, false, false, false}),
+        make("Expected", { false, false, false, false, true, true, false, true, true, false })),
+    a_info, b_info, output_info, are_tensors_const, expected)
+{
+    TensorInfo a{a_info};
+    TensorInfo b{b_info};
+    a.set_are_values_constant(are_tensors_const);
+    b.set_are_values_constant(are_tensors_const);
+    Status status =  NEMatMul::validate(&a,
+                                        &b,
+                                        &output_info,
+                                        MatMulInfo(),
+                                        CpuMatMulSettings());
+    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+}
+// *INDENT-ON*
+// clang-format on
+
+// Generic Template
+template <typename T>
+using NEMatMulFixture = MatMulValidationWithActivationFixture<Tensor, Accessor, NEMatMul, CpuMatMulSettings, T>;
+
+// Fast math Template
+template <typename T>
+using NEMatMulFastMathFixture = MatMulGenericValidationFixture<Tensor, Accessor, NEMatMul, CpuMatMulSettings, T>;
+
+template <typename T>
+using NEMatMulFixedFormatFixture = MatMulFixedFormatFixture<Tensor, Accessor, NEMatMul, CpuMatMulSettings, T>;
+
+template <typename T>
+using NEMatMulDynamicTensorsFixture =
+    MatMulValidationWithDynamicTensorsFixture<Tensor, Accessor, NEMatMul, CpuMatMulSettings, T>;
+
+template <typename T>
+using NEQuantizedMatMulFixture = QuantizedMatMulValidationFixture<Tensor, Accessor, NEMatMul, CpuMatMulSettings, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEMatMulFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEMatMulFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+FIXTURE_DATA_TEST_CASE(RunHighDimensions,
+                       NEMatMulFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::HighDimensionalMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunStressDynamicTensors,
+                       NEMatMulDynamicTensorsFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfRuns", 5)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+TEST_SUITE_END() // FP32
+
+#ifdef ARM_COMPUTE_ENABLE_BF16
+/* Note : MatMul BF16 is enabled by specifying FP32 datatype and enabling the fast math setting */
+constexpr AbsoluteTolerance<float> tolerance_bf16(0.02f);
+TEST_SUITE(BF16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEMatMulFastMathFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo", {ActivationLayerInfo()}),
+                               make("RunTimes", {0}),
+                               make("Settings", {CpuMatMulSettings().fast_math(true)}),
+                               make("LhsQInfo", {QuantizationInfo()}),
+                               make("RhsQInfo", {QuantizationInfo()}),
+                               make("OutQInfo", {QuantizationInfo()})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_bf16);
+}
+
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+FIXTURE_DATA_TEST_CASE(RunTinyFixedFormat,
+                       NEMatMulFixedFormatFixture<bfloat16>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::TinyMatMulDataset(),
+                               make("TransposeA", {false}),
+                               make("TransposeB", {false}),
+                               make("DataType", DataType::BFLOAT16),
+                               make("ActivationInfo", {ActivationLayerInfo()}),
+                               make("RunTimes", {0}),
+                               make("Settings", {CpuMatMulSettings().fast_math(true).fixed_format(true)}),
+                               make("LhsQInfo", {QuantizationInfo()}),
+                               make("RhsQInfo", {QuantizationInfo()}),
+                               make("OutQInfo", {QuantizationInfo()})))
+{
+    if (CPUInfo::get().has_bf16())
+    {
+        // Validate output
+        validate(Accessor(_target), _reference, tolerance_bf16);
+    }
+}
+#endif /* ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS */
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEMatMulFastMathFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo", {ActivationLayerInfo()}),
+                               make("RunTimes", {0}),
+                               make("Settings", {CpuMatMulSettings().fast_math(true)}),
+                               make("LhsQInfo", {QuantizationInfo()}),
+                               make("RhsQInfo", {QuantizationInfo()}),
+                               make("OutQInfo", {QuantizationInfo()})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_bf16, 0.01 /* tolerance_num */);
+}
+TEST_SUITE_END() // BF16
+#endif           /* ARM_COMPUTE_ENABLE_BF16 */
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEMatMulFixture<half>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F16),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp16);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEMatMulFixture<half>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F16),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp16);
+}
+FIXTURE_DATA_TEST_CASE(RunStressDynamicTensors,
+                       NEMatMulDynamicTensorsFixture<half>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F16),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfRuns", 5)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp16);
+}
+TEST_SUITE_END() // FP16
+#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+TEST_SUITE_END() // Float
+
+#ifdef __aarch64__ // All the GeMM CPU assembly kernels for integer datatypes require aarch64
+TEST_SUITE(Quantized)
+
+TEST_SUITE(QASYMM8)
+
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEQuantizedMatMulFixture<uint8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 50, 1)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 30, -1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 2)})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallExtraActivation,
+                       NEQuantizedMatMulFixture<uint8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::SmallerMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 50, 1)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 30, -1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 2)})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEQuantizedMatMulFixture<uint8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 100, 1)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 200, -1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 2)})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEQuantizedMatMulFixture<int8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8_SIGNED),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 40, -2)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 50, 1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 1)})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallExtraActivation,
+                       NEQuantizedMatMulFixture<int8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::SmallerMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8_SIGNED),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 40, -2)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 50, 1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 1)})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEQuantizedMatMulFixture<int8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8_SIGNED),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 150, -2)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 250, 1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 1)})))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE_END() // Quantized
+#endif           // __aarch64__
+
+TEST_SUITE_END() // MatMul
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/MaxUnpoolingLayer.cpp b/tests/validation/NEON/MaxUnpoolingLayer.cpp
index a33ec2885e..0eb021fe71 100644
--- a/tests/validation/NEON/MaxUnpoolingLayer.cpp
+++ b/tests/validation/NEON/MaxUnpoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,10 +22,12 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/datasets/ShapeDatasets.h"
 #include "tests/framework/Asserts.h"
@@ -33,7 +35,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/MaxUnpoolingLayerFixture.h"
-
 namespace arm_compute
 {
 namespace test
@@ -51,7 +52,7 @@ const auto PoolingLayerIndicesDatasetFPSmall = combine(combine(framework::datase
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
+FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
                                                                                                                    framework::dataset::make("DataType", DataType::F32))),
                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
 
@@ -63,7 +64,7 @@ FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<float>, framewor
 TEST_SUITE_END() // FP32
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
+FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
                                                                                                                   framework::dataset::make("DataType", DataType::F16))),
                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
 
@@ -74,9 +75,39 @@ FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<half>, framework
 }
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
 TEST_SUITE_END() // Float
+
+TEST_SUITE(KernelSelection)
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
+               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType", { DataType::F32,
+                                                              DataType::F16,
+                                                              DataType::QASYMM8,
+                                                              DataType::QASYMM8_SIGNED
+                                                            })),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuMaxUnpoolingLayerKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_maxunpooling";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // KernelSelection
 TEST_SUITE_END() // PoolingLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/MeanStdDev.cpp b/tests/validation/NEON/MeanStdDev.cpp
deleted file mode 100644
index 8c254b2633..0000000000
--- a/tests/validation/NEON/MeanStdDev.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/MeanStdDevFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-RelativeTolerance<float> tolerance_rel_high_error(0.05f);
-RelativeTolerance<float> tolerance_rel_low_error(0.0005f);
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(MeanStdDev)
-
-template <typename T>
-using NEMeanStdDevFixture = MeanStdDevValidationFixture<Tensor, Accessor, NEMeanStdDev, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEMeanStdDevFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                        DataType::U8)))
-{
-    // Validate mean output
-    validate(_target.first, _reference.first);
-
-    // Validate std_dev output
-    validate(_target.second, _reference.second, tolerance_rel_high_error);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEMeanStdDevFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                        DataType::U8)))
-{
-    // Validate mean output
-    validate(_target.first, _reference.first, tolerance_rel_low_error);
-
-    // Validate std_dev output
-    validate(_target.second, _reference.second, tolerance_rel_high_error);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp b/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
index 11f97a5e84..085f3608a0 100644
--- a/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
+++ b/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,8 @@ namespace
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<half> tolerance_f16(half(0.2f));
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-RelativeTolerance<float> tolerance_f32(1e-8f);
+RelativeTolerance<float>   tolerance_f32(1e-4f);
+RelativeTolerance<uint8_t> tolerance_qasymm8(1);
 } // namespace
 
 TEST_SUITE(NEON)
@@ -81,7 +82,7 @@ TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEMeanStdDevNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(),
                        framework::dataset::make("DataType", DataType::F16)),
                        framework::dataset::make("InPlace", { false, true })),
-                       framework::dataset::make("Epsilon", { 1e-8 })))
+                       framework::dataset::make("Epsilon", { 1e-3 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -101,7 +102,7 @@ TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEMeanStdDevNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(),
                        framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("InPlace", { false, true })),
-                       framework::dataset::make("Epsilon", { 1e-8 })))
+                       framework::dataset::make("Epsilon", { 1e-7 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -114,11 +115,25 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEMeanStdDevNormalizationLayerFixture<float>, f
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
+
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEMeanStdDevNormalizationLayerFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InPlace", { false, true })),
+                       framework::dataset::make("Epsilon", { 1e-7 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // Quantized
+TEST_SUITE_END() // QASYMM8
+
 TEST_SUITE_END() // MeanStdNormalizationLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Median3x3.cpp b/tests/validation/NEON/Median3x3.cpp
deleted file mode 100644
index 0fac6511f6..0000000000
--- a/tests/validation/NEON/Median3x3.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEMedian3x3.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/Median3x3Fixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /* Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /* Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Median3x3)
-
-template <typename T>
-using NEMedian3x3Fixture = Median3x3ValidationFixture<Tensor, Accessor, NEMedian3x3, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEMedian3x3Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                               DataType::U8)),
-                                                                                                       datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEMedian3x3Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                               DataType::U8)),
-                                                                                                       datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/MinMaxLocation.cpp b/tests/validation/NEON/MinMaxLocation.cpp
deleted file mode 100644
index 2f9cd3dc77..0000000000
--- a/tests/validation/NEON/MinMaxLocation.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEMinMaxLocation.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/ArrayAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/MinMaxLocationFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(MinMaxLocation)
-
-template <typename T>
-using NEMinMaxLocationFixture = MinMaxLocationValidationFixture<Tensor, Accessor, Array<Coordinates2D>, ArrayAccessor<Coordinates2D>, NEMinMaxLocation, T>;
-
-TEST_SUITE(U8)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEMinMaxLocationFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::U8)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEMinMaxLocationFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::U8)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEMinMaxLocationFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::S16)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEMinMaxLocationFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::S16)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-TEST_SUITE_END() // S16
-
-TEST_SUITE(Float)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEMinMaxLocationFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                          DataType::F32)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEMinMaxLocationFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                          DataType::F32)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-TEST_SUITE_END() // F32
-
-TEST_SUITE_END() // MinMaxLocation
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/NonLinearFilter.cpp b/tests/validation/NEON/NonLinearFilter.cpp
deleted file mode 100644
index 4b2ad2796a..0000000000
--- a/tests/validation/NEON/NonLinearFilter.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NENonLinearFilter.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/MatrixPatternDataset.h"
-#include "tests/datasets/NonLinearFilterFunctionDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/NonLinearFilterFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(NonLinearFilter)
-
-template <typename T>
-using NENonLinearFilterFixture = NonLinearFilterValidationFixture<Tensor, Accessor, NENonLinearFilter, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NENonLinearFilterFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     datasets::NonLinearFilterFunctions()),
-                                                                                                                     framework::dataset::make("MaskSize", { 3U, 5U })),
-                                                                                                                     datasets::MatrixPatterns()),
-                                                                                                                     datasets::BorderModes()),
-                                                                                                             framework::dataset::make("DataType", DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), _border_size));
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NENonLinearFilterFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                     datasets::NonLinearFilterFunctions()),
-                                                                                                                     framework::dataset::make("MaskSize", { 3U, 5U })),
-                                                                                                                     datasets::MatrixPatterns()),
-                                                                                                                     datasets::BorderModes()),
-                                                                                                             framework::dataset::make("DataType", DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), _border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/NormalizationLayer.cpp b/tests/validation/NEON/NormalizationLayer.cpp
index d910cbdc45..7260d1044e 100644
--- a/tests/validation/NEON/NormalizationLayer.cpp
+++ b/tests/validation/NEON/NormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -126,7 +126,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // NormalizationLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/OpticalFlow.cpp b/tests/validation/NEON/OpticalFlow.cpp
deleted file mode 100644
index f0bc47e804..0000000000
--- a/tests/validation/NEON/OpticalFlow.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/Array.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
-#include "arm_compute/runtime/NEON/functions/NEOpticalFlow.h"
-#include "arm_compute/runtime/Pyramid.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/ArrayAccessor.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/OpticalFlowDataset.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/OpticalFlowFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(OpticalFlow)
-
-// *INDENT-OFF*
-// clang-format off
-using NEOpticalFlowFixture = OpticalFlowValidationFixture<Tensor,
-                                                          Accessor,
-                                                          KeyPointArray,
-                                                          ArrayAccessor<KeyPoint>,
-                                                          NEOpticalFlow,
-                                                          Pyramid,
-                                                          NEGaussianPyramidHalf,
-                                                          uint8_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEOpticalFlowFixture, framework::DatasetMode::NIGHTLY, combine(combine(
-                       datasets::SmallOpticalFlowDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       datasets::BorderModes()))
-{
-    // Validate output
-    ArrayAccessor<KeyPoint> array(_target);
-    validate_keypoints(array.buffer(),
-                       array.buffer() + array.num_values(),
-                       _reference.begin(),
-                       _reference.end());
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEOpticalFlowFixture, framework::DatasetMode::NIGHTLY, combine(combine(
-                       datasets::LargeOpticalFlowDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       datasets::BorderModes()))
-{
-    // Validate output
-    ArrayAccessor<KeyPoint> array(_target);
-
-    validate_keypoints(array.buffer(),
-                       array.buffer() + array.num_values(),
-                       _reference.begin(),
-                       _reference.end());
-}
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/PReluLayer.cpp b/tests/validation/NEON/PReluLayer.cpp
index c4b3c88787..69a05175ad 100644
--- a/tests/validation/NEON/PReluLayer.cpp
+++ b/tests/validation/NEON/PReluLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -206,7 +206,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // PReluLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/PadLayer.cpp b/tests/validation/NEON/PadLayer.cpp
index ea9ef72e13..4947af3423 100644
--- a/tests/validation/NEON/PadLayer.cpp
+++ b/tests/validation/NEON/PadLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -188,7 +188,7 @@ TEST_SUITE_END() // QASYMM8
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // PadLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Phase.cpp b/tests/validation/NEON/Phase.cpp
deleted file mode 100644
index 6d939b357c..0000000000
--- a/tests/validation/NEON/Phase.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEPhase.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/PhaseFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr AbsoluteTolerance<uint8_t> tolerance_value(1);
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Phase)
-
-template <typename T>
-using NEPhaseFixture = PhaseValidationFixture<Tensor, Accessor, NEPhase, T>;
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPhaseFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::Small2DShapes(), framework::dataset::make("Format", Format::S16)),
-                                                                                                   framework::dataset::make("PhaseType", { PhaseType::UNSIGNED, PhaseType::SIGNED })))
-{
-    // Validate output
-    validate_wrap(Accessor(_target), _reference, tolerance_value, 0);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPhaseFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::Large2DShapes(), framework::dataset::make("Format", Format::S16)),
-                                                                                                   framework::dataset::make("PhaseType", { PhaseType::UNSIGNED, PhaseType::SIGNED })))
-{
-    // Validate output
-    validate_wrap(Accessor(_target), _reference, tolerance_value, 0);
-}
-TEST_SUITE_END() // S16
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp
index 1bb0588919..964d1c5deb 100644
--- a/tests/validation/NEON/PixelWiseMultiplication.cpp
+++ b/tests/validation/NEON/PixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,6 +56,11 @@ const auto PixelWiseMultiplicationQASYMM8QuantDataset = combine(combine(
                                                                     framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0) })),
                                                                 framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 32768.f, 0) }));
 
+const auto PixelWiseMultiplicationQASYMM8QuantInPlaceDataset = combine(combine(
+                                                                           framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 32768.f, 10) }),
+                                                                           framework::dataset::make("Src1QInfo", { QuantizationInfo(5.f / 32768.f, 10) })),
+                                                                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 10) }));
+
 const auto PixelWiseMultiplicationPolicySTNUDataset = combine(
                                                           framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE }),
                                                           framework::dataset::make("RoundingPolicy", { RoundingPolicy::TO_NEAREST_UP }));
@@ -75,7 +80,8 @@ const auto PixelWiseMultiplicationPolicySTZDataset = combine(
  * expected to have either different quantization information, data type
  * or different shape we are not testing in-place computation.
  */
-const auto InPlaceDataSet = framework::dataset::make("InPlace", { false, true });
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 
 #define DEFAULT_VALIDATE validate(Accessor(_target), _reference);
 #define VALIDATE(TYPE, TOLERANCE) validate(Accessor(_target), _reference, AbsoluteTolerance<TYPE>(TOLERANCE), 0.f);
@@ -275,7 +281,19 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8SignedFixture,
                                                                                                                      framework::dataset::make("Scale", { scale_unity })),
                                                                                                                      PixelWiseMultiplicationPolicySTZDataset),
                                                                                                                      PixelWiseMultiplicationQASYMM8QuantDataset),
-                                                                                                                     InPlaceDataSet))
+                                                                                                                     OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallInPlace, NEPixelWiseMultiplicationQASYMM8SignedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                       framework::dataset::make("DataTypeIn1", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("DataTypeOut", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("Scale", { scale_unity })),
+                       PixelWiseMultiplicationPolicySTZDataset),
+                       PixelWiseMultiplicationQASYMM8QuantInPlaceDataset),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -292,7 +310,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framew
                                                                                                                        framework::dataset::make("Scale", { scale_255 })),
                                                                                                                        PixelWiseMultiplicationPolicySTNUDataset),
                                                                                                                        PixelWiseMultiplicationQASYMM8QuantDataset),
-                                                                                                               InPlaceDataSet))
+                                                                                                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -306,7 +324,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framew
                                                                                                                        framework::dataset::make("Scale", { scale_unity })),
                                                                                                                        PixelWiseMultiplicationPolicySTZDataset),
                                                                                                                        PixelWiseMultiplicationQASYMM8QuantDataset),
-                                                                                                               InPlaceDataSet))
+                                                                                                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -320,7 +338,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framew
                                                                                                                        framework::dataset::make("Scale", { scale_other })),
                                                                                                                        PixelWiseMultiplicationPolicySTZDataset),
                                                                                                                        PixelWiseMultiplicationQASYMM8QuantDataset),
-                                                                                                               InPlaceDataSet))
+                                                                                                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -335,7 +353,20 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationBroadcastQASYMM8Fixtur
                                                        framework::dataset::make("Scale", { scale_other })),
                                                PixelWiseMultiplicationPolicySTZDataset),
                                        PixelWiseMultiplicationQASYMM8QuantDataset),
-                               framework::dataset::make("InPlace", { false })))
+                               OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyInPlace, NEPixelWiseMultiplicationBroadcastQASYMM8Fixture, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::TinyShapesBroadcastInplace(),
+                                                                               framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
+                                                                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
+                                                               framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
+                                                       framework::dataset::make("Scale", { scale_other })),
+                                               PixelWiseMultiplicationPolicySTZDataset),
+                                       PixelWiseMultiplicationQASYMM8QuantInPlaceDataset),
+                               InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -351,7 +382,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framew
                                                                                                                        framework::dataset::make("Scale", { scale_255 })),
                                                                                                                        PixelWiseMultiplicationPolicySTNUDataset),
                                                                                                                        PixelWiseMultiplicationQSYMM16QuantDataset),
-                                                                                                               InPlaceDataSet))
+                                                                                                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qsymm16);
@@ -365,7 +396,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framew
                                                                                                                        framework::dataset::make("Scale", { scale_unity })),
                                                                                                                        PixelWiseMultiplicationPolicySTZDataset),
                                                                                                                        PixelWiseMultiplicationQSYMM16QuantDataset),
-                                                                                                               InPlaceDataSet))
+                                                                                                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qsymm16);
@@ -379,7 +410,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framew
                                                                                                                        framework::dataset::make("Scale", { scale_other })),
                                                                                                                        PixelWiseMultiplicationPolicySTZDataset),
                                                                                                                        PixelWiseMultiplicationQSYMM16QuantDataset),
-                                                                                                               InPlaceDataSet))
+                                                                                                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qsymm16);
@@ -394,7 +425,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16ToS32Fixture, f
                                                                                                                     framework::dataset::make("Scale", { scale_unity })),
                                                                                                                     PixelWiseMultiplicationPolicySTZDataset),
                                                                                                                     PixelWiseMultiplicationQSYMM16QuantDataset),
-                                                                                                                    framework::dataset::make("InPlace", { false })))
+                                                                                                                    OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -411,7 +442,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationU8U8ToS16Fixture, fram
                                                                                                                        framework::dataset::make("Scale", { scale_255 })),
                                                                                                                        datasets::ConvertPolicies()),
                                                                                                                        framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_UP)),
-                                                                                                                       framework::dataset::make("InPlace", { false })))
+                                                                                                                       OutOfPlaceDataSet))
 {
     // Validate output
     validate_wrap(Accessor(_target), _reference, AbsoluteTolerance<int16_t>(1), 0.f);
@@ -451,17 +482,17 @@ TEST_SUITE_END() // U8toU8
 TEST_SUITE(U8toS16)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_255, TO_NEAREST_UP, framework::dataset::make("InPlace", { false }),
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_255, TO_NEAREST_UP, OutOfPlaceDataSet,
                                                  WRAP_VALIDATE(int16_t, 2))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE(ScaleUnity)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_unity, TO_ZERO, framework::dataset::make("InPlace", { false }),
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_unity, TO_ZERO, OutOfPlaceDataSet,
                                                  DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleUnity
 
 TEST_SUITE(ScaleOther)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_other, TO_ZERO, framework::dataset::make("InPlace", { false }),
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_other, TO_ZERO, OutOfPlaceDataSet,
                                                  DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleOther
 
diff --git a/tests/validation/NEON/Pooling3dLayer.cpp b/tests/validation/NEON/Pooling3dLayer.cpp
new file mode 100644
index 0000000000..07054462f5
--- /dev/null
+++ b/tests/validation/NEON/Pooling3dLayer.cpp
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/Pooling3dLayerDataset.h"
+#include "tests/datasets/PoolingTypesDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/Pooling3dLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+/** Input data sets for floating-point data types */
+const auto Pooling3dLayerDatasetFP = combine(combine(combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { Size3D(2, 3, 2) })),
+                                                             framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 1, 1), Size3D(1, 2, 1), Size3D(2, 2, 1) })),
+                                                     framework::dataset::make("Padding", { Padding3D(0, 1, 0), Padding3D(1, 1, 1) })),
+                                             framework::dataset::make("ExcludePadding", { true, false }));
+
+const auto Pooling3dLayerDatasetFPSmall = combine(combine(combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { Size3D(2, 2, 2), Size3D(3, 3, 3) })),
+                                                                  framework::dataset::make("Stride", { Size3D(2, 2, 2), Size3D(2, 1, 1) })),
+                                                          framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 1), Padding3D(1, 0, 0) })),
+                                                  framework::dataset::make("ExcludePadding", { true, false }));
+
+const auto Pooling3dLayerDatasetQASYMM8Small = combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                                               framework::dataset::make("PoolingSize", { Size3D(3, 3, 3) })),
+                                                                       framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 1, 1), Size3D(1, 2, 1), Size3D(2, 2, 1) })),
+                                                               framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 1), Padding3D(1, 0, 0) })),
+                                                       framework::dataset::make("ExcludePadding", { true }));
+
+const auto Pooling3dLayerDatasetQASYMM8Large = combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                                               framework::dataset::make("PoolingSize", { Size3D(3, 3, 3) })),
+                                                                       framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 2, 1) })),
+                                                               framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 0) })),
+                                                       framework::dataset::make("ExcludePadding", { true }));
+
+using ShapeDataset = framework::dataset::ContainerDataset<std::vector<TensorShape>>;
+
+constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+constexpr AbsoluteTolerance<float> tolerance_f16(0.01f);     /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
+#endif                                                       /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);   /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric type */
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_s(1); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric type */
+
+const auto qasymm8_in_qinfo_dataset  = framework::dataset::make("InputQuantInfo", { QuantizationInfo(.2f, 10) });
+const auto qasymm8_out_qinfo_dataset = framework::dataset::make("OutputQuantInfo",
+{
+    QuantizationInfo(.2f, 10), // Same qinfo
+    QuantizationInfo(.1f, 5),  // Multiplier <= 1
+    QuantizationInfo(2.f, 3)   // Multiplier > 1
+});
+
+const auto qasymm8_signed_in_qinfo_dataset  = framework::dataset::make("InputQuantInfo", { QuantizationInfo(.2f, -10) });
+const auto qasymm8_signed_out_qinfo_dataset = framework::dataset::make("OutputQuantInfo",
+{
+    QuantizationInfo(.2f, -10), // Same qinfo
+    QuantizationInfo(.1f, -5),  // Multiplier <= 1
+    QuantizationInfo(2.f, -3)   // Multiplier > 1
+});
+
+} //namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(Pooling3dLayer)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 27U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),     // Mismatching data type
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 4U, 2U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid pad/size combination
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 4U, 2U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid pad/size combination
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid output shape
+                                            TensorInfo(TensorShape(5U, 13U, 15U, 2U, 3U), 1, DataType::F32, DataLayout::NDHWC),     // Global Pooling
+                                            TensorInfo(TensorShape(13U,13U, 5U, 1U, 2U),  1, DataType::F32, DataLayout::NDHWC),     // Invalid output Global Pooling
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 4U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid data type
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 4U), 1, DataType::F32, DataLayout::NHWC),      // Invalid data layout
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 5U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(1U, 16U,  1U, 3U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 2U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                          }),
+    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(2U, 25U, 11U, 3U, 3U), 1, DataType::F16, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(2U, 30U, 11U, 3U, 2U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(2U, 25U, 16U, 3U, 2U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 3U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U,  1U,  1U, 1U, 3U), 1, DataType::F32, DataLayout::NDHWC),            // Global pooling applied
+                                            TensorInfo(TensorShape(5U,  2U,  2U, 2U, 2U), 1, DataType::F32, DataLayout::NDHWC),            // Invalid output Global Pooling
+                                            TensorInfo(TensorShape(5U, 12U, 12U, 3U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 12U, 12U, 3U, 4U), 1, DataType::QASYMM8, DataLayout::NDHWC),        // Invalid data type
+                                            TensorInfo(TensorShape(5U, 12U, 12U, 3U, 4U), 1, DataType::F32, DataLayout::NDHWC),            // Invalid data layout
+                                            TensorInfo(TensorShape(5U,  1U,  1U, 1U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(1U, 15U, 1U, 2U, 4U), 1, DataType::F32, DataLayout::NDHWC),             // size larger than height
+                                            TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U),  1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 6U, 6U, 2U, 2U),  1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U),  1, DataType::F32, DataLayout::NDHWC),
+                                    })),
+    framework::dataset::make("PoolInfo",  { Pooling3dLayerInfo(PoolingType::AVG, 3, Size3D(1, 1, 1), Padding3D(0, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1, 1, 1), Padding3D(2, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1, 1, 1), Padding3D(0, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::L2,  3, Size3D(1, 1, 1), Padding3D(0, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::AVG),
+                                            Pooling3dLayerInfo(PoolingType::MAX),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1U, 1U, 1U), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1U, 1U, 1U), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG),
+                                            Pooling3dLayerInfo(PoolingType::MAX, 2, Size3D(1, 1, 2), Padding3D(0, 0, 0), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(2U, 2U, 2U), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 1, Size3D(2U, 2U, 2U), Padding3D(2, 2, 2), true),  // pool size is equal to the padding size
+                                            Pooling3dLayerInfo(PoolingType::AVG, 1, Size3D(2U, 2U, 2U), Padding3D(2, 2, 2), false), // pool size is equal to the padding size
+                                            Pooling3dLayerInfo(PoolingType::AVG, 3, Size3D(2U, 2U, 2U), Padding3D(2,1,2,2,1,2), false, false, DimensionRoundingType::CEIL), // CEIL with asymmetric Padding
+                                            })),
+    framework::dataset::make("Expected", { false, false, false, false, true, false, false, false, false, true , false, true, false, false, false})),
+    input_info, output_info, pool_info, expected)
+{
+    bool is_valid = bool(NEPooling3dLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pool_info));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using NEPoolingLayer3dFixture = Pooling3dLayerValidationFixture<Tensor, Accessor, NEPooling3dLayer, T>;
+
+template <typename T>
+using NESpecial3dPoolingLayerFixture = SpecialPooling3dLayerValidationFixture<Tensor, Accessor, NEPooling3dLayer, T>;
+
+template <typename T>
+using NEPooling3dLayerGlobalFixture = Pooling3dLayerGlobalValidationFixture<Tensor, Accessor, NEPooling3dLayer, T>;
+
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+
+FIXTURE_DATA_TEST_CASE(RunSpecial, NESpecial3dPoolingLayerFixture<float>, framework::DatasetMode::ALL, datasets::Pooling3dLayerDatasetSpecial() * framework::dataset::make("DataType", DataType::F32))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small5dShapes(), combine(Pooling3dLayerDatasetFPSmall,
+                                                                                                            framework::dataset::make("DataType", DataType::F32))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(datasets::Large5dShapes(), combine(Pooling3dLayerDatasetFPSmall, framework::dataset::make("DataType", DataType::F32))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE(GlobalPooling)
+// *INDENT-OFF*
+// clang-format off
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(3U, 27U, 13U, 4U),
+                                                                             TensorShape(4U, 27U, 13U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(27, 13, 4) })),
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", {false, true})),
+                                    framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunGlobalSmall, NEPooling3dLayerGlobalFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(27U, 13U, 4U, 3U),
+                                                                             TensorShape(27U, 13U, 4U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(4U, 79U, 37U, 11U),
+                                                                             TensorShape(4U, 79U, 37U, 11U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(79, 37, 11) })),
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", {false, true})),
+                                    framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP32
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small5x5Shapes(), combine(Pooling3dLayerDatasetFPSmall,
+                                                                                                           framework::dataset::make("DataType", DataType::F16))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::Large5dShapes(), combine(Pooling3dLayerDatasetFP,
+                                                                                                           framework::dataset::make("DataType",
+                                                                                                                   DataType::F16))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+TEST_SUITE(GlobalPooling)
+// *INDENT-OFF*
+// clang-format off
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(3U, 27U, 13U, 4U),
+                                                                             TensorShape(4U, 27U, 13U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(27, 13, 4) })),
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", {false, true})),
+                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+
+FIXTURE_DATA_TEST_CASE(RunSmallGlobal, NEPooling3dLayerGlobalFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(27U, 13U, 4U, 3U),
+                                                                             TensorShape(27U, 13U, 4U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(4U, 79U, 37U, 11U),
+                                                                             TensorShape(4U, 79U, 37U, 11U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(79, 37, 11) })),
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", false)),
+                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP16
+#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+TEST_SUITE_END() // Float
+TEST_SUITE(Quantized)
+
+template <typename T>
+using NEPooling3dLayerQuantizedFixture = Pooling3dLayerValidationQuantizedFixture<Tensor, Accessor, NEPooling3dLayer, T>;
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPooling3dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small5dShapes(),
+                                                                                                                       combine(Pooling3dLayerDatasetQASYMM8Small,
+                                                                                                                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                                       qasymm8_in_qinfo_dataset),
+                                                                                                                       qasymm8_out_qinfo_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPooling3dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large5dShapes(),
+                                                                                                                       combine(Pooling3dLayerDatasetQASYMM8Large,
+                                                                                                                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                                       qasymm8_in_qinfo_dataset),
+                                                                                                                       qasymm8_out_qinfo_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPooling3dLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small5dShapes(),
+                                                                                                                      combine(Pooling3dLayerDatasetQASYMM8Small,
+                                                                                                                              framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
+                                                                                                                      qasymm8_signed_in_qinfo_dataset),
+                                                                                                                      qasymm8_signed_out_qinfo_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_s);
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+TEST_SUITE_END() // Pooling3dLayer
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp
index 0a6a5a18b5..161fe627cc 100644
--- a/tests/validation/NEON/PoolingLayer.cpp
+++ b/tests/validation/NEON/PoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
 #include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/PoolingLayerDataset.h"
@@ -54,7 +53,7 @@ const auto PoolingLayerDatasetFPSmall = combine(combine(combine(datasets::Poolin
 
 /** Input data sets for asymmetric data type */
 
-const auto PoolingLayerDatasetQASYMM8Small = combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3), Size2D(3, 7) })),
+const auto PoolingLayerDatasetQASYMM8Small = combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3), Size2D(3, 7), Size2D(7, 7) })),
                                                              framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 2, 1, 1) })),
                                                      framework::dataset::make("ExcludePadding", { true }));
 
@@ -66,6 +65,29 @@ constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);   /**< Tolerance valu
 constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_s(1); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric type */
 const auto                           pool_data_layout_dataset = framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC });
 
+const auto qasymm8_in_qinfo_dataset  = framework::dataset::make("InputQuantInfo", { QuantizationInfo(.2f, 10) });
+const auto qasymm8_out_qinfo_dataset = framework::dataset::make("OutputQuantInfo",
+{
+    QuantizationInfo(.2f, 10), // Same qinfo
+    QuantizationInfo(.1f, 5),  // Multiplier <= 1
+    QuantizationInfo(2.f, 3)   // Multiplier > 1
+});
+
+const auto qasymm8_signed_in_qinfo_dataset  = framework::dataset::make("InputQuantInfo", { QuantizationInfo(.2f, -10) });
+const auto qasymm8_signed_out_qinfo_dataset = framework::dataset::make("OutputQuantInfo",
+{
+    QuantizationInfo(.2f, -10), // Same qinfo
+    QuantizationInfo(.1f, -5),  // Multiplier <= 1
+    QuantizationInfo(2.f, -3)   // Multiplier > 1
+});
+
+// Cases where pooling region is completely outside the input tensor (excluding global pooling)
+const auto pool_outside_input_dataset = zip(zip(zip(zip(
+                                                        framework::dataset::make("Shape", { TensorShape{ 2U, 2U, 1U }, TensorShape{ 2U, 2U, 4U }, TensorShape{ 3U, 5U, 2U }, TensorShape{ 10U, 20U, 3U } }),
+                                                        framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                                    framework::dataset::make("PoolingSize", { Size2D{ 2, 2 }, Size2D{ 3, 3 }, Size2D{ 2, 2 }, Size2D{ 3, 6 } })),
+                                                framework::dataset::make("PadStride", { PadStrideInfo{ 1, 1, 2, 2 }, PadStrideInfo{ 1, 1, 4, 4 }, PadStrideInfo{ 1, 1, 3, 3 }, PadStrideInfo{ 1, 1, 2, 5 } })),
+                                            framework::dataset::make("ExcludePadding", { false, false, false, false }));
 } // namespace
 
 TEST_SUITE(NEON)
@@ -82,7 +104,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                             TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32),     // Invalid output Global Pooling
                                             TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::QASYMM8), // Invalid exclude_padding = false with quantized type, no actual padding and NHWC
                                             TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32),
-                                          }),
+                                            TensorInfo(TensorShape(1U, 16U, 1U),  1, DataType::F32),
+                                            TensorInfo(TensorShape(112, 112, 64,1), 1, DataType::F32, DataLayout::NHWC), // Mismatching number of channels
+                                            TensorInfo(TensorShape(112, 112, 64,1), 1, DataType::F32, DataLayout::NHWC), // Mismatching width
+                                         }),
     framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F16),
                                             TensorInfo(TensorShape(25U, 10U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(30U, 11U, 2U), 1, DataType::F32),
@@ -91,7 +116,11 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                             TensorInfo(TensorShape(2U, 2U, 5U), 1, DataType::F32),
                                             TensorInfo(TensorShape(12U, 12U, 5U), 1, DataType::QASYMM8),
                                             TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
-                                          })),
+                                            TensorInfo(TensorShape(1U, 15U, 1U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(56, 56, 64,1), 1, DataType::F32, DataLayout::NHWC),
+                                            TensorInfo(TensorShape(56, 51, 64,1), 1, DataType::F32, DataLayout::NHWC),
+
+                                           })),
     framework::dataset::make("PoolInfo",  { PoolingLayerInfo(PoolingType::AVG, 3, DataLayout::NCHW, PadStrideInfo(1, 1, 0, 0)),
                                             PoolingLayerInfo(PoolingType::AVG, 3, DataLayout::NCHW, PadStrideInfo(1, 1, 0, 0)),
                                             PoolingLayerInfo(PoolingType::AVG, 2, DataLayout::NCHW, PadStrideInfo(1, 1, 2, 0)),
@@ -100,8 +129,12 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                             PoolingLayerInfo(PoolingType::MAX, DataLayout::NCHW),
                                             PoolingLayerInfo(PoolingType::AVG, 2, DataLayout::NHWC, PadStrideInfo(), false),
                                             PoolingLayerInfo(PoolingType::AVG, DataLayout::NCHW),
+                                            PoolingLayerInfo(PoolingType::MAX, 2, DataLayout::NHWC, PadStrideInfo(1, 1, 0, 0), false),
+                                            PoolingLayerInfo(PoolingType::MAX,3,DataLayout::NHWC,PadStrideInfo(2,2,1,1)),
+                                            PoolingLayerInfo(PoolingType::MAX,3,DataLayout::NHWC,PadStrideInfo(2,2,1,1)),
+
                                            })),
-    framework::dataset::make("Expected", { false, false, false, false, true, false, false, false, true })),
+    framework::dataset::make("Expected", { false, false, false, false, true, false, true, false, false, false, false})),
     input_info, output_info, pool_info, expected)
 {
     bool is_valid = bool(NEPoolingLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pool_info));
@@ -115,6 +148,8 @@ using NEPoolingLayerIndicesFixture = PoolingLayerIndicesValidationFixture<Tensor
 
 template <typename T>
 using NEPoolingLayerFixture = PoolingLayerValidationFixture<Tensor, Accessor, NEPoolingLayer, T>;
+template <typename T>
+using NEPoolingLayerMixedDataLayoutFixture = PoolingLayerValidationFixture<Tensor, Accessor, NEPoolingLayer, T, true>;
 
 template <typename T>
 using NESpecialPoolingLayerFixture = SpecialPoolingLayerValidationFixture<Tensor, Accessor, NEPoolingLayer, T>;
@@ -122,27 +157,37 @@ using NESpecialPoolingLayerFixture = SpecialPoolingLayerValidationFixture<Tensor
 const auto PoolingLayerIndicesDatasetFPSmall = combine(combine(combine(framework::dataset::make("PoolType", { PoolingType::MAX }), framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
                                                                framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0) })),
                                                        framework::dataset::make("ExcludePadding", { true, false }));
-
+const auto PoolingLayerKernelIndicesDatasetFPSmall = combine(combine(combine(framework::dataset::make("PoolType", { PoolingType::MAX }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3), Size2D(7, 7) })),
+                                                                     framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 1, 1, 1) })),
+                                                             framework::dataset::make("ExcludePadding", { false }));
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunIndices, NEPoolingLayerIndicesFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
-                                                                                                                   framework::dataset::make("DataType",
-                                                                                                                           DataType::F32))),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
-
-                                                                                                                  ))
+FIXTURE_DATA_TEST_CASE(RunIndices, NEPoolingLayerIndicesFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                   combine(PoolingLayerIndicesDatasetFPSmall,
+                                                                                                                           framework::dataset::make("DataType", DataType::F32))),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                   framework::dataset::make("UseKernelIndices", { false })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+    validate(Accessor(_target_indices), _ref_indices);
+}
+FIXTURE_DATA_TEST_CASE(RunKernelIndices, NEPoolingLayerIndicesFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                   combine(PoolingLayerKernelIndicesDatasetFPSmall,
+                                                                                                                           framework::dataset::make("DataType", DataType::F32))),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                   framework::dataset::make("UseKernelIndices", { true })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
     validate(Accessor(_target_indices), _ref_indices);
 }
-
 FIXTURE_DATA_TEST_CASE(RunSpecial, NESpecialPoolingLayerFixture<float>, framework::DatasetMode::ALL, datasets::PoolingLayerDatasetSpecial() * framework::dataset::make("DataType", DataType::F32))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPSmall,
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), combine(PoolingLayerDatasetFPSmall,
                                                                                                                   framework::dataset::make("DataType",
                                                                                                                           DataType::F32))),
                                                                                                           pool_data_layout_dataset))
@@ -150,6 +195,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<float>, framework::Datase
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEPoolingLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(),
+                       combine(combine(combine(combine(datasets::PoolingTypes(),
+                                                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                               framework::dataset::make("PadStride", { PadStrideInfo(2, 1, 0, 0) })),
+                                       framework::dataset::make("ExcludePadding", { false })),
+                               framework::dataset::make("DataType", DataType::F32))),
+                       pool_data_layout_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
 FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
                                                                                                                 framework::dataset::make("DataType",
                                                                                                                         DataType::F32))),
@@ -158,11 +214,32 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<float>, framework::Datase
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
+TEST_SUITE(CornerCases)
+FIXTURE_DATA_TEST_CASE(PoolRegionCompletelyOutsideInput, NEPoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(pool_outside_input_dataset,
+                       framework::dataset::make("DataType",
+                                                DataType::F32)),
+                       pool_data_layout_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // CornerCases
 TEST_SUITE_END() // FP32
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPSmall,
+FIXTURE_DATA_TEST_CASE(RunIndices, NEPoolingLayerIndicesFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                  combine(PoolingLayerIndicesDatasetFPSmall,
+                                                                                                                          framework::dataset::make("DataType",
+                                                                                                                                  DataType::F16))),
+                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                  framework::dataset::make("UseKernelIndices", { false })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+    validate(Accessor(_target_indices), _ref_indices);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), combine(PoolingLayerDatasetFPSmall,
                                                                                                                  framework::dataset::make("DataType", DataType::F16))),
                                                                                                          pool_data_layout_dataset))
 {
@@ -176,6 +253,16 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<half>, framework::Dataset
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
+TEST_SUITE(CornerCases)
+FIXTURE_DATA_TEST_CASE(PoolRegionCompletelyOutsideInput, NEPoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(pool_outside_input_dataset,
+                       framework::dataset::make("DataType",
+                                                DataType::F16)),
+                       pool_data_layout_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // CornerCases
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 TEST_SUITE_END() // Float
@@ -184,20 +271,64 @@ TEST_SUITE(Quantized)
 
 template <typename T>
 using NEPoolingLayerQuantizedFixture = PoolingLayerValidationQuantizedFixture<Tensor, Accessor, NEPoolingLayer, T>;
+template <typename T>
+using NEPoolingLayerQuantizedMixedDataLayoutFixture = PoolingLayerValidationQuantizedFixture<Tensor, Accessor, NEPoolingLayer, T, true>;
 
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQASYMM8Small,
-                                                                                                                     framework::dataset::make("DataType", DataType::QASYMM8))),
-                                                                                                                     pool_data_layout_dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallNCHW, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                       combine(PoolingLayerDatasetQASYMM8Small,
+                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                       qasymm8_in_qinfo_dataset),
+                       qasymm8_in_qinfo_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                     combine(PoolingLayerDatasetQASYMM8Small,
+                                                                                                                             framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                     qasymm8_in_qinfo_dataset),
+                                                                                                                     qasymm8_out_qinfo_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEPoolingLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                       combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                               framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })),
+                                       framework::dataset::make("ExcludePadding", { true })),
+                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })),
+                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 255.f, 10) })),
+                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 255.f, 5) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQASYMM8Small,
-                                                                                                                    framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
-                                                                                                                    pool_data_layout_dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                    combine(PoolingLayerDatasetQASYMM8Small,
+                                                                                                                            framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
+                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                    qasymm8_signed_in_qinfo_dataset),
+                                                                                                                    qasymm8_signed_in_qinfo_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_s);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEPoolingLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                       combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                               framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })),
+                                       framework::dataset::make("ExcludePadding", { true })),
+                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })),
+                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, -10) })),
+                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, -10) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_s);
@@ -205,7 +336,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<int8_t>, framewo
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // PoolingLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/PriorBoxLayer.cpp b/tests/validation/NEON/PriorBoxLayer.cpp
index cc63877a5f..9a9b9a8e3d 100644
--- a/tests/validation/NEON/PriorBoxLayer.cpp
+++ b/tests/validation/NEON/PriorBoxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -90,7 +90,7 @@ TEST_SUITE_END() // Float
 TEST_SUITE_END() // FP32
 
 TEST_SUITE_END() // PriorBoxLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/QLSTMLayerNormalization.cpp b/tests/validation/NEON/QLSTMLayerNormalization.cpp
index 8925d0b39e..9738213114 100644
--- a/tests/validation/NEON/QLSTMLayerNormalization.cpp
+++ b/tests/validation/NEON/QLSTMLayerNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -167,7 +167,7 @@ TEST_SUITE(Quantized)
 TEST_SUITE(QSYMM16)
 
 /** Tests will be targetting
- * - Comparison between NEON kernel and the exact same but scalar version of reference kernel
+ * - Comparison between optimized kernel and the exact same but scalar version of reference kernel
  * - Input shapes of 1D and 2D with the first dimension covers boundary values of 128-bit vector size (0~3 iterations)
  * - Weight and bias 1D shape that have same size as that of input shapes
  * - Quantization scale is greater and smaller than one.
@@ -179,7 +179,7 @@ TEST_SUITE(QSYMM16)
  * - The algorithm has been sensitive to quantization scale but it is hard to fully test
  *   the sensitivity due to aforementioned reason.
  * - Again, it is hard to fully test corner values due to the exact same algorithm of the
- *   reference kernel and the NEON kernel.
+ *   reference kernel and the optimized kernel.
  */
 
 constexpr uint32_t qsymm16_per_vector = vector_size_byte / sizeof(int16_t);
@@ -216,7 +216,7 @@ FIXTURE_DATA_TEST_CASE(RandomValue2D, NEQLSTMLayerNormalizationFixture<int16_t>,
 TEST_SUITE_END() // QSYMM16
 TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // QLSTMLayerNormalization
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/NEON/QuantizationLayer.cpp b/tests/validation/NEON/QuantizationLayer.cpp
index 04b3a78972..bab7490762 100644
--- a/tests/validation/NEON/QuantizationLayer.cpp
+++ b/tests/validation/NEON/QuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,7 @@
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/QuantizationLayerFixture.h"
 
+
 namespace arm_compute
 {
 namespace test
@@ -97,7 +98,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8Signed, NEQuantizationLayerQASYMM8SignedFi
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_u8);
+    validate(Accessor(_target), _reference, tolerance_s8);
 }
 FIXTURE_DATA_TEST_CASE(RunSmallQASYMM16, NEQuantizationLayerQASYMM16Fixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(QuantizationSmallShapes,
                        framework::dataset::make("DataType", DataType::F32)),
@@ -140,7 +141,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8Signed, NEQuantizationLayerQASYMM8SignedFi
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_u8);
+    validate(Accessor(_target), _reference, tolerance_s8);
 }
 FIXTURE_DATA_TEST_CASE(RunSmallQASYMM16, NEQuantizationLayerQASYMM16Fixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(QuantizationSmallShapes,
                        framework::dataset::make("DataType", DataType::F16)),
@@ -182,7 +183,16 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8, NEQuantizationLayerQASYMM8GenFixture<uin
                        framework::dataset::make("DataType", DataType::QASYMM8)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8 })),
                        framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(0.5f, 10) })),
-                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(2.0f, 15) })))
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(2.0f, 15), QuantizationInfo(0.5f, 25) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_u8);
+}
+FIXTURE_DATA_TEST_CASE(ConvertUint8toInt8, NEQuantizationLayerQASYMM8GenFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(QuantizationSmallShapes,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("DataTypeOut", { DataType::QASYMM8_SIGNED })),
+                       framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(2.0f, -1) })),
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(2.0f, 127) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
@@ -191,7 +201,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8_SIGNED, NEQuantizationLayerQASYMM8_SIGNED
                        framework::dataset::make("DataTypeIn", DataType::QASYMM8)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8_SIGNED })),
                        framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.0f, 10), QuantizationInfo(2.0f, -25) })),
-                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.0f, 15) })))
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.0f, 15), QuantizationInfo(1.0f, 127) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_s8);
@@ -211,7 +221,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8_SIGNED, NEQuantizationLayerQASYMM8_SIGNED
                        framework::dataset::make("DataTypeIn", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8_SIGNED })),
                        framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.0f, 10) })),
-                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(2.0f, -5) })))
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(2.0f, -5), QuantizationInfo(1.0f, 43) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_s8);
@@ -220,16 +230,26 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8, NEQuantizationLayerQASYMM8GenFixture<int
                        framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8 })),
                        framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(2.0f, 10), QuantizationInfo(2.0f, -25) })),
-                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.0f, 30) })))
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.0f, 30), QuantizationInfo(2.0f, -128) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
 }
+FIXTURE_DATA_TEST_CASE(ConvertInt8toUint8, NEQuantizationLayerQASYMM8_SIGNEDGenFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(QuantizationSmallShapes,
+                       framework::dataset::make("DataTypeIn", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("DataTypeOut", { DataType::QASYMM8 })),
+                       framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.0f, 0) })),
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.0f, -128) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_s8);
+}
+
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // QuantizationLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/RNNLayer.cpp b/tests/validation/NEON/RNNLayer.cpp
index 7a3117a0b3..979aa0f2c5 100644
--- a/tests/validation/NEON/RNNLayer.cpp
+++ b/tests/validation/NEON/RNNLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,8 +39,11 @@ namespace validation
 {
 namespace
 {
-RelativeTolerance<float> tolerance_f32(0.001f);
-RelativeTolerance<half>  tolerance_f16(half(0.1));
+RelativeTolerance<float> tolerance_f32(0.001f); /**< Relative tolerance value for comparing reference's output against implementation's output for DataType:F32 */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+RelativeTolerance<half> tolerance_f16(half(0.1)); /**< Relative tolerance value for comparing reference's output against implementation's output for DataType:F16 */
+constexpr float         abs_tolerance_f16(0.02f); /**< Absolute tolerance value for comparing reference's output against implementation's output for DataType:F16 */
+#endif                                            /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 } // namespace
 
 TEST_SUITE(NEON)
@@ -136,12 +139,12 @@ TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NERNNLayerFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallRNNLayerDataset(), framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_f16);
+    validate(Accessor(_target), _reference, tolerance_f16, 0.02f, abs_tolerance_f16);
 }
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 TEST_SUITE_END() // RNNLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ROIAlignLayer.cpp b/tests/validation/NEON/ROIAlignLayer.cpp
index e475c46c7d..98c92a0b20 100644
--- a/tests/validation/NEON/ROIAlignLayer.cpp
+++ b/tests/validation/NEON/ROIAlignLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,6 +53,7 @@ AbsoluteTolerance<float> absolute_tolerance_f16(0.001f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
+constexpr AbsoluteTolerance<int8_t> tolerance_qasymm8_s(1);
 } // namespace
 
 TEST_SUITE(NEON)
@@ -154,13 +155,13 @@ FIXTURE_DATA_TEST_CASE(Small, NEROIAlignLayerQuantizedFixture<int8_t>, framework
                                framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(2.f / 255.f, 120) })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_s);
 }
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // RoiAlign
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ROIPoolingLayer.cpp b/tests/validation/NEON/ROIPoolingLayer.cpp
new file mode 100644
index 0000000000..8b5147e57f
--- /dev/null
+++ b/tests/validation/NEON/ROIPoolingLayer.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/Globals.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ROIDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ROIPoolingLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<float> relative_tolerance_f32(0.01f);
+AbsoluteTolerance<float> absolute_tolerance_f32(0.001f);
+
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
+} // end namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(RoiPooling)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Successful test
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::QASYMM8), // Successful test (quantized)
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Incorrect rois type
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Mismatching data type input/output
+                                                       TensorInfo(TensorShape(250U, 128U, 2U), 1, DataType::F32), // Mismatching depth size input/output
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Mismatching number of rois and output batch size
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Invalid number of values per ROIS
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Mismatching height and width input/output
+
+                                                     }),
+               framework::dataset::make("RoisInfo", { TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::F16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 10U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(4, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                    })),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::QASYMM8),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F16),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(5U, 5U, 3U, 4U), 1, DataType::F32),
+                                                     })),
+               framework::dataset::make("PoolInfo", { ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      })),
+               framework::dataset::make("Expected", { true, true, false, false, false, false, false })),
+               input_info, rois_info, output_info, pool_info, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(NEROIPoolingLayer::validate(&input_info.clone()->set_is_resizable(true), &rois_info.clone()->set_is_resizable(true), &output_info.clone()->set_is_resizable(true), pool_info)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+using NEROIPoolingLayerFloatFixture = ROIPoolingLayerFixture<Tensor, Accessor, NEROIPoolingLayer, float>;
+
+TEST_SUITE(Float)
+FIXTURE_DATA_TEST_CASE(SmallROIPoolingLayerFloat, NEROIPoolingLayerFloatFixture, framework::DatasetMode::ALL,
+                       framework::dataset::combine(framework::dataset::combine(datasets::SmallROIDataset(),
+                                                                               framework::dataset::make("DataType", { DataType::F32 })),
+                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, relative_tolerance_f32, .02f, absolute_tolerance_f32);
+}
+
+TEST_SUITE_END() // Float test suite end
+
+// Begin quantized tests
+TEST_SUITE(Quantized)
+template <typename T>
+using NEROIPoolingLayerQuantizedFixture = ROIPoolingLayerQuantizedFixture<Tensor, Accessor, NEROIPoolingLayer, T>;
+
+TEST_SUITE(QASYMM8)
+
+FIXTURE_DATA_TEST_CASE(Small, NEROIPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(datasets::SmallROIDataset(),
+                                                       framework::dataset::make("DataType", { DataType::QASYMM8 })),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 127) })),
+                               framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(2.f / 255.f, 120) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // end qasymm8 tests
+TEST_SUITE_END() // end quantized tests
+
+TEST_SUITE_END() // RoiPooling
+TEST_SUITE_END() // NEON
+
+} // validation end
+} // test namespace end
+} // arm_compute namespace end
diff --git a/tests/validation/NEON/Range.cpp b/tests/validation/NEON/Range.cpp
index 57e891d09b..fda7b2c448 100644
--- a/tests/validation/NEON/Range.cpp
+++ b/tests/validation/NEON/Range.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -172,7 +172,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // Range
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ReduceMean.cpp b/tests/validation/NEON/ReduceMean.cpp
index 02cfcee262..8ca0bb53a7 100644
--- a/tests/validation/NEON/ReduceMean.cpp
+++ b/tests/validation/NEON/ReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,14 +46,15 @@ constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value f
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 constexpr AbsoluteTolerance<float> tolerance_f16(0.03f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
 #endif                                                   // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);    /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric quantized type */
 #ifdef __aarch64__
-constexpr AbsoluteTolerance<int8_t> tolerance_s8(1); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */
-#else                                                // __aarch64__
-constexpr AbsoluteTolerance<int8_t> tolerance_s8(2); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */
-#endif                                               // __aarch64__
+constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);    /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric quantized type */
+constexpr AbsoluteTolerance<int8_t>  tolerance_s8(1);    /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */
+#else // __aarch64__
+constexpr AbsoluteTolerance<uint8_t> tolerance_u8(2);    /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric quantized type */
+constexpr AbsoluteTolerance<int8_t>  tolerance_s8(2);    /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */
+#endif // __aarch64__
 
-const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(0, 1, 2, 3) }),
+const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(2, 3), Coordinates(0, 1, 2, 3) }),
                                framework::dataset::make("KeepDims", { true }));
 const auto axis_drop = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1), Coordinates(3) }), framework::dataset::make("KeepDims", { false }));
 } // namespace
@@ -212,7 +213,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // ReduceMean
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ReductionOperation.cpp b/tests/validation/NEON/ReductionOperation.cpp
index ed17e6968e..48c3a1a788 100644
--- a/tests/validation/NEON/ReductionOperation.cpp
+++ b/tests/validation/NEON/ReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -194,7 +194,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReductionOperationQuantizedFixture<int8_t>, f
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 TEST_SUITE_END() // ReductionOperation
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Remap.cpp b/tests/validation/NEON/Remap.cpp
deleted file mode 100644
index fe622ff95f..0000000000
--- a/tests/validation/NEON/Remap.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NERemap.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/RemapFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr AbsoluteTolerance<uint8_t> tolerance_value(0);
-constexpr float                      tolerance_number = 0.f;
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Remap)
-
-template <typename T>
-using NERemapFixture = RemapValidationFixture<Tensor, Accessor, NERemap, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NERemapFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                           framework::dataset::make("DataType",
-                                                                                                                   DataType::U8)),
-                                                                                                   framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NERemapFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                           framework::dataset::make("DataType",
-                                                                                                                   DataType::U8)),
-                                                                                                   framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/ReorderLayer.cpp b/tests/validation/NEON/ReorderLayer.cpp
new file mode 100644
index 0000000000..839ad0ac92
--- /dev/null
+++ b/tests/validation/NEON/ReorderLayer.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__)
+
+#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ReorderLayerDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ReorderFixture.h"
+#include "src/core/NEON/kernels/NEReorderKernel.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using framework::dataset::make;
+
+TEST_SUITE(NEON)
+TEST_SUITE(ReorderLayer)
+
+template <typename T>
+using NEReorderLayerAlias = ReorderValidationFixture<Tensor, Accessor, NEReorderLayer, T>;
+
+TEST_SUITE(FP32)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+DATA_TEST_CASE(ValidateReorderOHWIo8, framework::DatasetMode::ALL, combine(
+                                                                    zip(
+                                                                     make("InShape",{ TensorShape(10U, 9U), TensorShape(234U, 301U) }),
+                                                                     make("OutShape", { TensorShape(10U, 16U), TensorShape(234U, 304U) })
+                                                                    ),
+                                                                    zip(
+                                                                        make("InputWeightFormat", {WeightFormat::OHWI}),
+                                                                        make("OutputWeightFormat", {WeightFormat::OHWIo8})
+                                                                    )),
+            input_shape, output_shape,  input_wf,  output_wf)
+{
+    if(Scheduler::get().cpu_info().has_sve()){
+        arm_compute::NEReorderLayer reorder_layer;
+        int vector_length = arm_gemm::utils::get_vector_length<float>();
+        bool expected_bool_status = false;
+        if (vector_length == 8)
+        {
+            expected_bool_status = true;
+        }
+
+        TensorInfo input_tensor_info(input_shape, 1, DataType::F32);
+        TensorInfo output_tensor_info(output_shape, 1, DataType::F32);
+
+        Status status = reorder_layer.validate(&input_tensor_info, &output_tensor_info, input_wf, output_wf);
+
+        ARM_COMPUTE_EXPECT((expected_bool_status == bool(status)), framework::LogLevel::ERRORS);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunBlock8, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock8(), make("DataType", DataType::F32)))
+{
+    // Validate output
+    if (_hardware_supports)
+    {
+        validate(Accessor(_target), _reference);
+    }
+}
+#endif // ARM_COMPUTE_ENABLE_SVE
+
+FIXTURE_DATA_TEST_CASE(RunBlock4, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock4(), make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+TEST_SUITE_END() // FP32
+
+TEST_SUITE_END() // ReorderLayer
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif  // defined(__aarch64__)
diff --git a/tests/validation/NEON/ReorgLayer.cpp b/tests/validation/NEON/ReorgLayer.cpp
index cd8c10bd61..38bef51ed3 100644
--- a/tests/validation/NEON/ReorgLayer.cpp
+++ b/tests/validation/NEON/ReorgLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -126,7 +126,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEReorgLayerFixture<int8_t>, framework::Dataset
 TEST_SUITE_END() // S8
 
 TEST_SUITE_END() // ReorgLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ReshapeLayer.cpp b/tests/validation/NEON/ReshapeLayer.cpp
index bf39c399a5..e9f114d491 100644
--- a/tests/validation/NEON/ReshapeLayer.cpp
+++ b/tests/validation/NEON/ReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,6 +77,9 @@ input_info, output_info, expected)
 template <typename T>
 using NEReshapeLayerFixture = ReshapeLayerValidationFixture<Tensor, Accessor, NEReshapeLayer, T>;
 
+template <typename T>
+using NEReshapeLayerPaddedFixture = ReshapeLayerPaddedValidationFixture<Tensor, Accessor, NEReshapeLayer, T>;
+
 TEST_SUITE(Float)
 TEST_SUITE(F32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::F32)))
@@ -84,8 +87,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<float>, framework::Datase
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() //F32
+TEST_SUITE_END() //Float
 
 TEST_SUITE(Integer)
 TEST_SUITE(S8)
@@ -94,7 +97,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<int8_t>, framework::Datas
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() //S8
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::S16)))
@@ -102,11 +105,41 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<int16_t>, framework::Data
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() //S16
+TEST_SUITE_END() //Integer
+
+TEST_SUITE(Padded)
+TEST_SUITE(Float)
+TEST_SUITE(F32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerPaddedFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() //S32
+TEST_SUITE_END() //Float
+
+TEST_SUITE(Integer)
+TEST_SUITE(S8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerPaddedFixture<int8_t>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::S8)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() //S8
+
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerPaddedFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::S16)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() //S16
+TEST_SUITE_END() //Integer
+TEST_SUITE_END() //Padded
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() //ReshapeLayer
+TEST_SUITE_END() //NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Reverse.cpp b/tests/validation/NEON/Reverse.cpp
index 4278f35b30..7b5337f14b 100644
--- a/tests/validation/NEON/Reverse.cpp
+++ b/tests/validation/NEON/Reverse.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,8 @@ namespace validation
 {
 namespace
 {
-auto run_small_dataset = combine(datasets::SmallShapes(), datasets::Tiny1DShapes());
+using framework::dataset::make;
+auto run_small_dataset = combine(datasets::Small3DShapes(), datasets::Tiny1DShapes());
 auto run_large_dataset = combine(datasets::LargeShapes(), datasets::Tiny1DShapes());
 
 } // namespace
@@ -53,28 +54,31 @@ TEST_SUITE(Reverse)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-        framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), // Invalid axis datatype
+        make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), // Invalid axis datatype
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid axis shape
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid axis length (> 4)
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Mismatching shapes
+                                            TensorInfo(TensorShape(32U, 13U, 17U, 3U, 2U), 1, DataType::U8), // Unsupported source dimensions (>4)
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(2U), 1, DataType::U8),
         }),
-        framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8),
+        make("OutputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(2U, 13U, 2U), 1, DataType::U8),
+                                            TensorInfo(TensorShape(32U, 13U, 17U, 3U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(2U), 1, DataType::U8),
         })),
-        framework::dataset::make("AxisInfo", { TensorInfo(TensorShape(3U), 1, DataType::U8),
+        make("AxisInfo", { TensorInfo(TensorShape(3U), 1, DataType::U8),
                                            TensorInfo(TensorShape(2U, 10U), 1, DataType::U32),
                                            TensorInfo(TensorShape(8U), 1, DataType::U32),
                                            TensorInfo(TensorShape(2U), 1, DataType::U32),
                                            TensorInfo(TensorShape(2U), 1, DataType::U32),
                                            TensorInfo(TensorShape(2U), 1, DataType::U32),
+                                           TensorInfo(TensorShape(2U), 1, DataType::U32),
         })),
-        framework::dataset::make("Expected", { false, false, false, false, true, true})),
+        make("Expected", { false, false, false, false, false, true, true})),
         src_info, dst_info, axis_info, expected)
 {
     Status s = NEReverse::validate(&src_info.clone()->set_is_resizable(false),
@@ -95,7 +99,11 @@ TEST_SUITE(F16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEReverseFixture<half>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(run_small_dataset, framework::dataset::make("DataType", DataType::F16)))
+                       combine(
+                           run_small_dataset,
+                           make("DataType", DataType::F16),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -104,7 +112,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        NEReverseFixture<half>,
                        framework::DatasetMode::NIGHTLY,
-                       combine(run_large_dataset, framework::dataset::make("DataType", DataType::F16)))
+                       combine(
+                           run_large_dataset,
+                           make("DataType", DataType::F16),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -116,7 +128,11 @@ TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEReverseFixture<float>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(run_small_dataset, framework::dataset::make("DataType", DataType::F32)))
+                       combine(
+                           run_small_dataset,
+                           make("DataType", DataType::F32),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -125,7 +141,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        NEReverseFixture<float>,
                        framework::DatasetMode::NIGHTLY,
-                       combine(run_large_dataset, framework::dataset::make("DataType", DataType::F32)))
+                       combine(
+                           run_large_dataset,
+                           make("DataType", DataType::F32),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -138,7 +158,11 @@ TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEReverseFixture<uint8_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(run_small_dataset, framework::dataset::make("DataType", DataType::QASYMM8)))
+                       combine(
+                           run_small_dataset,
+                           make("DataType", DataType::QASYMM8),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -147,7 +171,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        NEReverseFixture<uint8_t>,
                        framework::DatasetMode::NIGHTLY,
-                       combine(run_large_dataset, framework::dataset::make("DataType", DataType::QASYMM8)))
+                       combine(
+                           run_large_dataset,
+                           make("DataType", DataType::QASYMM8),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -156,7 +184,7 @@ TEST_SUITE_END() // QASYMM8
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // Reverse
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp
index 9a1e9b01b3..f1209a21ac 100644
--- a/tests/validation/NEON/Scale.cpp
+++ b/tests/validation/NEON/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,16 +22,10 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEScale.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
 #include "tests/datasets/ScaleValidationDataset.h"
-#include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
-#include "tests/validation/Helpers.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ScaleFixture.h"
 
@@ -50,8 +44,8 @@ using datasets::ScaleSamplingPolicySet;
 using datasets::ScaleAlignCornersSamplingPolicySet;
 
 /** We consider vector size in byte 64 since the maximum size of
- * a vector used by @ref NEScaleKernel is currently 64-byte (float32x4x4).
- * There are possibility to reduce test time further by using
+ * a vector used by the kernel is currently 64-byte (float32x4x4).
+ * There is possibility to reduce test time further by using
  * smaller vector sizes for different data types where applicable.
  */
 constexpr uint32_t vector_byte = 64;
@@ -62,25 +56,31 @@ constexpr uint32_t num_elements_per_vector()
     return vector_byte / sizeof(T);
 }
 
-/** Scale data types */
-const auto ScaleDataTypes = framework::dataset::make("DataType",
+/** Quantization information data set */
+const auto QuantizationInfoSet = framework::dataset::make("QuantizationInfo",
 {
-    DataType::U8,
-    DataType::S16,
-    DataType::F32,
+    QuantizationInfo(0.5f, -10),
 });
 
 /** Quantization information data set */
-const auto QuantizationInfoSet = framework::dataset::make("QuantizationInfo",
+const auto InputQuantizationInfoSet = framework::dataset::make("InputQuantizationInfo",
 {
     QuantizationInfo(0.5f, -10),
 });
 
+/** Quantization information data set */
+const auto OutputQuantizationInfoSet = framework::dataset::make("OutputQuantizationInfo",
+{
+    QuantizationInfo(0.2f, 20),
+});
+
 /** Tolerance */
 constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);
+constexpr AbsoluteTolerance<int8_t>  tolerance_s8(1);
 constexpr AbsoluteTolerance<int16_t> tolerance_s16(1);
 RelativeTolerance<float>             tolerance_f32(0.05);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+constexpr float         abs_tolerance_f16(0.01f);
 RelativeTolerance<half> tolerance_f16(half(0.1));
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
@@ -94,9 +94,8 @@ TEST_SUITE(Validate)
 
 /** Validate test suite is to test ARM_COMPUTE_RETURN_ON_* macros
  * we use to check the validity of given arguments in @ref NEScale
- * and subsequent call to @ref NEScaleKernel.
  * Since this is using validate() of @ref NEScale, which pre-adjust
- * arguments for @ref NEScaleKernel, the following conditions in
+ * arguments for the kernel, the following conditions in
  * the kernel are not currently tested.
  * - The same input and output
  * - Data type of offset, dx and dy
@@ -156,8 +155,6 @@ TEST_CASE(SupportDataType, framework::DatasetMode::ALL)
         { DataType::BFLOAT16, false },
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         { DataType::F16, true },
-#else  // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        { DataType::F16, false },
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         { DataType::F32, true },
         { DataType::F64, false },
@@ -317,11 +314,18 @@ DATA_TEST_CASE(CheckNoPaddingInterpAREA, framework::DatasetMode::ALL, combine(co
 template <typename T>
 using NEScaleFixture = ScaleValidationFixture<Tensor, Accessor, NEScale, T>;
 template <typename T>
+using NEScaleMixedDataLayoutFixture = ScaleValidationFixture<Tensor, Accessor, NEScale, T, true>;
+template <typename T>
 using NEScaleQuantizedFixture = ScaleValidationQuantizedFixture<Tensor, Accessor, NEScale, T>;
+template <typename T>
+using NEScaleDifferentOutputQuantizedFixture = ScaleValidationDifferentOutputQuantizedFixture<Tensor, Accessor, NEScale, T>;
+template <typename T>
+using NEScaleQuantizedMixedDataLayoutFixture = ScaleValidationQuantizedFixture<Tensor, Accessor, NEScale, T, true>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-const auto f32_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<float>())), framework::dataset::make("DataType", DataType::F32));
+const auto f32_shape      = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<float>())), framework::dataset::make("DataType", DataType::F32));
+const auto f32_shape_nhwc = combine(datasets::Small3DShapes(), framework::dataset::make("DataType", DataType::F32));
 FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f32_shape, ScaleSamplingPolicySet))
 {
     //Create valid region
@@ -331,6 +335,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<float>, framework::DatasetMode::
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEScaleMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, ASSEMBLE_DATASET(f32_shape, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
 FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f32_shape, ScaleAlignCornersSamplingPolicySet))
 {
     //Create valid region
@@ -340,10 +353,38 @@ FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<float>, framework::D
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
 }
+FIXTURE_DATA_TEST_CASE(RunMediumNHWC, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f32_shape_nhwc, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumMixedDataLayoutNHWC, NEScaleMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, ASSEMBLE_NHWC_DATASET(f32_shape_nhwc, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumAlignCornersNHWC, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f32_shape_nhwc, ScaleAlignCornersSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
 TEST_SUITE_END() // FP32
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-const auto f16_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<half>())), framework::dataset::make("DataType", DataType::F16));
+const auto f16_shape      = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<half>())), framework::dataset::make("DataType", DataType::F16));
+const auto f16_shape_nhwc = combine(datasets::Small3DShapes(), framework::dataset::make("DataType", DataType::F16));
 FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f16_shape, ScaleSamplingPolicySet))
 {
     //Create valid region
@@ -351,7 +392,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<half>, framework::DatasetMode::A
     const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
-    validate(Accessor(_target), _reference, valid_region, tolerance_f16);
+    validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f16_shape, ScaleAlignCornersSamplingPolicySet))
 {
@@ -360,7 +401,34 @@ FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<half>, framework::Da
     const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
-    validate(Accessor(_target), _reference, valid_region, tolerance_f16);
+    validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumNHWC, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f16_shape_nhwc, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumMixedDataLayoutNHWC, NEScaleMixedDataLayoutFixture<half>, framework::DatasetMode::PRECOMMIT, ASSEMBLE_NHWC_DATASET(f16_shape_nhwc, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumAlignCornersNHWC, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f16_shape_nhwc, ScaleAlignCornersSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
 }
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -388,6 +456,27 @@ FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<uint8_t>, framework:
     validate(Accessor(_target), _reference, valid_region, tolerance_u8);
 }
 TEST_SUITE_END() // U8
+TEST_SUITE(S8)
+const auto s8_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<int8_t>())), framework::dataset::make("DataType", DataType::S8));
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_S8_DATASET(s8_shape, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_s8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_S8_DATASET(s8_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_s8);
+}
+TEST_SUITE_END() // S8
 TEST_SUITE(S16)
 const auto s16_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<int16_t>())), framework::dataset::make("DataType", DataType::S16));
 FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<int16_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(s16_shape, ScaleSamplingPolicySet))
@@ -423,6 +512,26 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<uint8_t>, framework::Da
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_u8);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallDifferentOutputQuantization, NEScaleDifferentOutputQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+                       ASSEMBLE_DIFFERENTLY_QUANTIZED_DATASET(qasymm8_shape, ScaleSamplingPolicySet, InputQuantizationInfoSet, OutputQuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_u8);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEScaleQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape, ScaleSamplingPolicySet,
+                       QuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_u8);
+}
 FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape, ScaleAlignCornersSamplingPolicySet,
                        QuantizationInfoSet))
 {
@@ -446,6 +555,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<int8_t>, framework::Dat
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_qasymm8_signed);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallDifferentOutputQuantization, NEScaleDifferentOutputQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+                       ASSEMBLE_DIFFERENTLY_QUANTIZED_DATASET(qasymm8_signed_shape, ScaleSamplingPolicySet, InputQuantizationInfoSet, OutputQuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_qasymm8_signed);
+}
 FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleQuantizedFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_signed_shape, ScaleAlignCornersSamplingPolicySet,
                        QuantizationInfoSet))
 {
@@ -460,7 +579,7 @@ TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // Scale
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Schaar.cpp b/tests/validation/NEON/Schaar.cpp
deleted file mode 100644
index 28c43ccf1c..0000000000
--- a/tests/validation/NEON/Schaar.cpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/GradientDimensionDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ScharrFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(Scharr)
-
-TEST_SUITE(W3x3)
-using NEScharr3x3Fixture = ScharrValidationFixture<Tensor, Accessor, NEScharr3x3, uint8_t, int16_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScharr3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                      Format::U8)),
-                                                                                              datasets::GradientDimensions()))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEScharr3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                      Format::U8)),
-                                                                                              datasets::GradientDimensions()))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Select.cpp b/tests/validation/NEON/Select.cpp
index 199b520ac8..40744581b0 100644
--- a/tests/validation/NEON/Select.cpp
+++ b/tests/validation/NEON/Select.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -140,7 +140,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // Select
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Slice.cpp b/tests/validation/NEON/Slice.cpp
index 54b0fbf9d2..d5549c8cdb 100644
--- a/tests/validation/NEON/Slice.cpp
+++ b/tests/validation/NEON/Slice.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -111,7 +111,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // Slice
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Sobel.cpp b/tests/validation/NEON/Sobel.cpp
deleted file mode 100644
index 86d2c67704..0000000000
--- a/tests/validation/NEON/Sobel.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
-#include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
-#include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/SobelFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(Sobel)
-
-TEST_SUITE(W3x3)
-using NESobel3x3Fixture = SobelValidationFixture<Tensor, Accessor, NESobel3x3, uint8_t, int16_t>;
-
-TEST_SUITE(X)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-}
-TEST_SUITE_END()
-TEST_SUITE(Y)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE(XY)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-TEST_SUITE(W5x5)
-using NESobel5x5Fixture = SobelValidationFixture<Tensor, Accessor, NESobel5x5, uint8_t, int16_t>;
-
-TEST_SUITE(X)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel5x5Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel5x5Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-}
-TEST_SUITE_END()
-TEST_SUITE(Y)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel5x5Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel5x5Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE(XY)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel5x5Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel5x5Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-TEST_SUITE(W7x7)
-using NESobel7x7Fixture = SobelValidationFixture<Tensor, Accessor, NESobel7x7, uint8_t, int32_t>;
-TEST_SUITE(X)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel7x7Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel7x7Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-}
-TEST_SUITE_END()
-TEST_SUITE(Y)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel7x7Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel7x7Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE(XY)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel7x7Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel7x7Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp
index 2a9e30604e..94d0866c38 100644
--- a/tests/validation/NEON/SoftmaxLayer.cpp
+++ b/tests/validation/NEON/SoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,21 +25,22 @@
 #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuSoftmaxKernel.h"
 #include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
 #include "tests/datasets/ShapeDatasets.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/SoftmaxLayerFixture.h"
-
 namespace arm_compute
 {
 namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
 namespace
 {
 /** Tolerance for float operations */
@@ -51,7 +52,7 @@ constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
 constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
 
 /** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
+const auto CNNDataTypes = make("DataType",
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     DataType::F16,
@@ -62,56 +63,55 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
 
 TEST_SUITE(NEON)
 TEST_SUITE(SoftmaxLayer)
-
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),    // Mismatching data types
-                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),    // Mismatching shapes
-                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info
-                                                                  QuantizationInfo(1.f/256, 12)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
-                                                                  QuantizationInfo(1.f/256, 12)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,  //Invalid axis high
-                                                                  QuantizationInfo(1.f/256, 12)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,  //Invalid axis low
-                                                                  QuantizationInfo(1.f/256, 12)),
-                                                      }),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
-                                                       TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8,
-                                                                  QuantizationInfo(1.f/256, 12)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
-                                                                  QuantizationInfo(1.f/256, 0)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
-                                                                  QuantizationInfo(1.f/256, 0)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
-                                                                  QuantizationInfo(1.f/256, 0)),
-                                                     })),
-               framework::dataset::make("beta", { 1.0,
-                                                  2.0,
-                                                  1.0,
-                                                  2.0,
-                                                  1.0,
-                                                  1.0,
-                                                  2.0,
-                                                  1.0,
-                                                })),
-               framework::dataset::make("axis", { 0,
-                                                  0,
-                                                  0,
-                                                  1,
-                                                  0,
-                                                  -1,
-                                                  2,
-                                                  -3,
-                                                })),
-               framework::dataset::make("Expected", { false, false, false, true, true, true, false, false })),
-               input_info, output_info, beta, axis, expected)
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+    make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),    // Mismatching data types
+                        TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),    // Mismatching shapes
+                        TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info
+                                    QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                    QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,  //Invalid axis high
+                                    QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,  //Invalid axis low
+                                    QuantizationInfo(1.f/256, 12)),
+                        }),
+    make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
+                        TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
+                        TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8,
+                                    QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                    QuantizationInfo(1.f/256, 0)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                    QuantizationInfo(1.f/256, 0)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                    QuantizationInfo(1.f/256, 0)),
+                        }),
+    make("beta", { 1.0,
+                   2.0,
+                   1.0,
+                   2.0,
+                   1.0,
+                   1.0,
+                   2.0,
+                   1.0,
+                }),
+    make("axis", { 0,
+                   0,
+                   0,
+                   1,
+                   0,
+                   -1,
+                   2,
+                   -3,
+                }),
+    make("Expected", { false, false, false, true, true, true, false, false })),
+    input_info, output_info, beta, axis, expected)
 {
     ARM_COMPUTE_EXPECT(bool(NESoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS);
 }
@@ -121,29 +121,80 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 template <typename T>
 using NESoftmaxLayerFixture = SoftmaxValidationFixture<Tensor, Accessor, NESoftmaxLayer, T>;
 
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
+    concat(
+        combine(
+            make("CpuExt", std::string("neon")),
+            make("DataType", { DataType::F32,
+                            DataType::F16,
+                            DataType::QASYMM8,
+                            DataType::QASYMM8_SIGNED})
+        ),
+        combine(
+            make("CpuExt", std::string("sme2")),
+            make("DataType", { DataType::F32,
+                            DataType::F16}))
+        ),
+        cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "neon");
+    cpu_isa.sme2 = (cpu_ext == "sme2");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuSoftmaxKernel::get_implementation(
+        SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */, 0 /* axis */, CPUInfo::get().get_sme2_vector_length()},
+        cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = cpu_ext + "_" + cpu_impl_dt(data_type) + "_softmax";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                 framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                 framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                         framework::dataset::make("Axis", { 0, 1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+    combine(
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, -1 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+    combine(
+        datasets::SmallShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, 1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                   framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                           framework::dataset::make("Axis", { 0, 2, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+    combine(
+        datasets::Small4DShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f }),
+        make("Axis", { 0, 2, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                                       framework::dataset::make("DataType", DataType::F16)),
-                                                                                                               framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                       framework::dataset::make("Axis", { 0 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+    combine(
+        datasets::SoftmaxLayerLargeShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -152,26 +203,30 @@ TEST_SUITE_END() //FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                    framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                            framework::dataset::make("Axis", { 0, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+    combine(
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::F32),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                    framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                            framework::dataset::make("Axis", { 0, -2, 3 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::Small4DShapes(),
+        make("DataType", DataType::F32),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, -2, 3 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                        framework::dataset::make("Axis", { 0 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::SoftmaxLayerLargeShapes(),
+        make("DataType", DataType::F32),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -184,29 +239,40 @@ using NESoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture<Tensor,
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                                 framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                 combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                         framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                 framework::dataset::make("Axis", { 0, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+    combine(
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::QASYMM8),
+        combine(
+            make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+            make("Beta", { 1.0f, 2.f })
+        ),
+        make("Axis", { 0, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                 framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                 combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                         framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                 framework::dataset::make("Axis", { 0, 1, -2 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+    combine(
+        datasets::Small4DShapes(),
+        make("DataType", DataType::QASYMM8),
+        combine(
+            make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+            make("Beta", { 1.0f, 2.f })),
+        make("Axis", { 0, 1, -2 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                   combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                           framework::dataset::make("Beta", { 1.0f, 2.0f }))),
-                                                                                                                   framework::dataset::make("Axis", { 0 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+    combine(
+        datasets::SoftmaxLayerLargeShapes(),
+        make("DataType", DataType::QASYMM8),
+        combine(
+            make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+            make("Beta", { 1.0f, 2.0f })
+        ),
+        make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -214,20 +280,28 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framew
 TEST_SUITE_END() //QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                                                                                        combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                framework::dataset::make("Axis", { 0, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+    combine(
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        combine(
+            make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+            make("Beta", { 1.0f, 2.f })
+        ),
+        make("Axis", { 0, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                                                                                        combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                framework::dataset::make("Axis", { 0, 1, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+    combine(
+        datasets::Small4DShapes(),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        combine(
+            make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+            make("Beta", { 1.0f, 2.f })
+        ),
+        make("Axis", { 0, 1, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
diff --git a/tests/validation/NEON/SpaceToBatchLayer.cpp b/tests/validation/NEON/SpaceToBatchLayer.cpp
index d9fea97e23..61642c6ff9 100644
--- a/tests/validation/NEON/SpaceToBatchLayer.cpp
+++ b/tests/validation/NEON/SpaceToBatchLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -162,7 +162,7 @@ FIXTURE_DATA_TEST_CASE(Large, NESpaceToBatchLayerQuantizedFixture<uint8_t>, fram
 TEST_SUITE_END() // QASYMM8
 TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // SpaceToBatch
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/SpaceToDepthLayer.cpp b/tests/validation/NEON/SpaceToDepthLayer.cpp
index 2cdfb06680..6d87bc4fd4 100644
--- a/tests/validation/NEON/SpaceToDepthLayer.cpp
+++ b/tests/validation/NEON/SpaceToDepthLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -107,7 +107,7 @@ TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // SpaceToDepthLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Split.cpp b/tests/validation/NEON/Split.cpp
index e7133fa530..72df2ad663 100644
--- a/tests/validation/NEON/Split.cpp
+++ b/tests/validation/NEON/Split.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -166,7 +166,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // Split
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/StackLayer.cpp b/tests/validation/NEON/StackLayer.cpp
index df0de81b4f..3828010c7b 100644
--- a/tests/validation/NEON/StackLayer.cpp
+++ b/tests/validation/NEON/StackLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,69 +44,74 @@ namespace test
 {
 namespace validation
 {
+
+using framework::dataset::make;
 namespace
 {
 // *INDENT-OFF*
 // clang-format off
 /** Data types */
-const auto data_types = framework::dataset::make("DataType", { DataType::QASYMM8, DataType::F16, DataType::F32 });
+const auto data_types = make("DataType", { DataType::QASYMM8, DataType::F16, DataType::F32 });
 
 /** Num tensors values to test */
-const auto n_values = framework::dataset::make("NumTensors", { 3, 4 });
+const auto n_values = make("NumTensors", { 3, 4 });
 
 /** Shapes 1D to test */
-const auto shapes_1d_small = combine(datasets::Small1DShapes(), framework::dataset::make("Axis", -1, 2));
+const auto shapes_1d_small = combine(datasets::Small1DShapes(), make("Axis", -1, 2));
 
 /** Shapes 2D to test */
-const auto shapes_2d_small = combine(datasets::Small2DShapes(), framework::dataset::make("Axis", -2, 3));
+const auto shapes_2d_small = combine(datasets::Small2DShapes(), make("Axis", -2, 3));
 
 /** Shapes 3D to test */
-const auto shapes_3d_small = combine(datasets::Small3DShapes(), framework::dataset::make("Axis", -3, 4));
+const auto shapes_3d_small = combine(datasets::Small3DShapes(), make("Axis", -3, 4));
 
 /** Shapes 4D to test */
-const auto shapes_4d_small = combine(datasets::Small4DShapes(), framework::dataset::make("Axis", -4, 5));
+const auto shapes_4d_small = combine(datasets::Small4DShapes(), make("Axis", -4, 5));
 
 /** Shapes 1D to test */
-const auto shapes_1d_large = combine(datasets::Large1DShapes(), framework::dataset::make("Axis", -1, 2));
+const auto shapes_1d_large = combine(datasets::Large1DShapes(), make("Axis", -1, 2));
 
 /** Shapes 2D to test */
-const auto shapes_2d_large = combine(datasets::Medium2DShapes(), framework::dataset::make("Axis", -2, 3));
+const auto shapes_2d_large = combine(datasets::Medium2DShapes(), make("Axis", -2, 3));
 
 /** Shapes 3D to test */
-const auto shapes_3d_large = combine(datasets::Medium3DShapes(), framework::dataset::make("Axis", -3, 4));
+const auto shapes_3d_large = combine(datasets::Medium3DShapes(), make("Axis", -3, 4));
 
 /** Shapes 4D to test */
-const auto shapes_4d_large = combine(datasets::Medium4DShapes(), framework::dataset::make("Axis", -4, 5));
+const auto shapes_4d_large = combine(datasets::Medium4DShapes(), make("Axis", -4, 5));
 } // namespace
 
 /** Fixture to use */
 template<typename T>
 using NEStackLayerFixture = StackLayerValidationFixture<Tensor, ITensor, Accessor, NEStackLayer, T>;
 
+template<typename T>
+using NEStackLayerWithPaddingFixture = StackLayerWithPaddingValidationFixture<Tensor, ITensor, Accessor, NEStackLayer, T>;
+
 using namespace arm_compute::misc::shape_calculator;
 
 TEST_SUITE(NEON)
 TEST_SUITE(StackLayer)
 
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-                                                                      framework::dataset::make("InputInfo",
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+make("InputInfo",
 {
     std::vector<TensorInfo>{ TensorInfo(TensorShape(9U, 8U), 1, DataType::U8) },
-    std::vector<TensorInfo>{ TensorInfo(TensorShape(1U, 2U), 1, DataType::U8) , TensorInfo(TensorShape(1U, 2U), 1, DataType::U8), TensorInfo(TensorShape(1U, 2U), 1, DataType::U8)}, 
+    std::vector<TensorInfo>{ TensorInfo(TensorShape(1U, 2U), 1, DataType::U8) , TensorInfo(TensorShape(1U, 2U), 1, DataType::U8), TensorInfo(TensorShape(1U, 2U), 1, DataType::U8)},
     std::vector<TensorInfo>{ TensorInfo(TensorShape(2U, 3U), 1, DataType::S32) },
-    std::vector<TensorInfo>{ TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32), TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32)}, 
+    std::vector<TensorInfo>{ TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32), TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32)},
     std::vector<TensorInfo>{ TensorInfo(TensorShape(9U, 8U), 1, DataType::S32) },
 }),
-framework::dataset::make("OutputInfo",
+make("OutputInfo",
 {
     TensorInfo(TensorShape(1U, 9U, 8U), 1, DataType::U8),   // Passes, stack 1 tensor on x axis
     TensorInfo(TensorShape(1U, 3U, 2U), 1, DataType::U8),   // Passes, stack 3 tensors on y axis
     TensorInfo(TensorShape(1U, 2U, 3U), 1, DataType::S32),  // fails axis <  (- input's rank)
     TensorInfo(TensorShape(3U, 7U, 5U), 1, DataType::S32),  // fails, input dimensions > 4
     TensorInfo(TensorShape(1U, 2U, 3U), 1, DataType::U8),   // fails mismatching data types
-})),
-framework::dataset::make("Axis", { -3, 1, -4, -3, 1 })),
-framework::dataset::make("Expected", { true, true, false, false, false })),
+}),
+make("Axis", { -3, 1, -4, -3, 1 }),
+make("Expected", { true, true, false, false, false })),
 input_info, output_info, axis, expected)
 {
     std::vector<TensorInfo>    ti(input_info);
@@ -121,18 +126,18 @@ input_info, output_info, axis, expected)
 TEST_SUITE(Shapes1D)
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_1d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_1d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_1d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_1d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -141,18 +146,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_1d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_1d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_1d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_1d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -161,18 +166,18 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_1d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_1d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_1d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_1d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -183,18 +188,18 @@ TEST_SUITE_END() // Shapes1D
 TEST_SUITE(Shapes2D)
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_2d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_2d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_2d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_2d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -203,18 +208,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_2d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_2d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_2d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_2d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -223,18 +228,18 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_2d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_2d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_2d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_2d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -245,18 +250,18 @@ TEST_SUITE_END() // Shapes2D
 TEST_SUITE(Shapes3D)
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_3d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_3d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_3d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_3d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -265,18 +270,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_3d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_3d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_3d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_3d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -285,18 +290,18 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_3d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_3d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_3d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_3d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -307,18 +312,29 @@ TEST_SUITE_END() // Shapes3D
 TEST_SUITE(Shapes4D)
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_4d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+// Testing the case with padding for only 4d shapes and for one data type. This is because the underlying code
+// path depends only on the padding, which isn't affected by the shapes or data types.
+FIXTURE_DATA_TEST_CASE(RunSmallWithPadding, NEStackLayerWithPaddingFixture<int>, framework::DatasetMode::ALL,
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_4d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_4d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -327,18 +343,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_4d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_4d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_4d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -347,26 +363,39 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_4d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_4d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_4d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // S8
 TEST_SUITE_END() // Shapes4D
+
+TEST_SUITE(HighDimensional)
+// The Cpu implementation supports tensors of size 4D+, but reference implementation does not.
+FIXTURE_DATA_TEST_CASE(RunHighDimensional, NEStackLayerFixture<char>, framework::DatasetMode::DISABLED,
+    combine(make("Shape", { TensorShape{2U, 3U, 4U, 5U, 3U} }),
+            make("Axis", { 5, 0, -3, 2 }),
+            make("DataType", { DataType::S8 }),
+            make("NumTensors", { 3 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // HighDimensional
 TEST_SUITE_END() // StackLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/StridedSlice.cpp b/tests/validation/NEON/StridedSlice.cpp
index 8332134b1b..a1b3cef801 100644
--- a/tests/validation/NEON/StridedSlice.cpp
+++ b/tests/validation/NEON/StridedSlice.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -113,7 +113,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // StridedSlice
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/NEON/TableLookup.cpp b/tests/validation/NEON/TableLookup.cpp
deleted file mode 100644
index 9543d827c4..0000000000
--- a/tests/validation/NEON/TableLookup.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NETableLookup.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/LutAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-
-#include "tests/validation/Helpers.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/TableLookupFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(TableLookup)
-
-template <typename T>
-using NETableLookupFixture = TableLookupValidationFixture<Tensor, Accessor, NETableLookup, LutAccessor<T>, Lut, T>;
-TEST_SUITE(U8)
-
-FIXTURE_DATA_TEST_CASE(RunSmallU8, NETableLookupFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLargeU8, NETableLookupFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END()
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmallS16, NETableLookupFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::S16)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLargeS16, NETableLookupFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::S16)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END()
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Threshold.cpp b/tests/validation/NEON/Threshold.cpp
deleted file mode 100644
index 0094a027f9..0000000000
--- a/tests/validation/NEON/Threshold.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEThreshold.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/datasets/ThresholdDataset.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ThresholdFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(Threshold)
-
-template <typename T>
-using ThresholdFixture = ThresholdValidationFixture<Tensor, Accessor, NEThreshold, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, ThresholdFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SmallShapes(), datasets::MixedThresholdDataset()),
-                                                                                                     framework::dataset::make("DataType",
-                                                                                                             DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, ThresholdFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), datasets::MixedThresholdDataset()),
-                                                                                                     framework::dataset::make("DataType",
-                                                                                                             DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Tile.cpp b/tests/validation/NEON/Tile.cpp
index aa96c95aca..2246c491f2 100644
--- a/tests/validation/NEON/Tile.cpp
+++ b/tests/validation/NEON/Tile.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -134,7 +134,7 @@ TEST_SUITE_END() // QASYMM8
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // Tile
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/UNIT/DynamicTensor.cpp b/tests/validation/NEON/UNIT/DynamicTensor.cpp
index 464633e7aa..373cc574c8 100644
--- a/tests/validation/NEON/UNIT/DynamicTensor.cpp
+++ b/tests/validation/NEON/UNIT/DynamicTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -117,7 +117,7 @@ FIXTURE_DATA_TEST_CASE(DynamicTensorType2Pipeline, NEDynamicTensorType2PipelineF
 }
 TEST_SUITE_END() // DynamicTensor
 TEST_SUITE_END() // UNIT
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/UNIT/MemoryManager.cpp b/tests/validation/NEON/UNIT/MemoryManager.cpp
index 83a9fcb332..2c57b534fe 100644
--- a/tests/validation/NEON/UNIT/MemoryManager.cpp
+++ b/tests/validation/NEON/UNIT/MemoryManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,15 +62,15 @@ TEST_CASE(BlobMemoryManagerSimpleWithinFunctionLevel, framework::DatasetMode::AL
     norm_layer_1.configure(&src, &dst, NormalizationLayerInfo(NormType::CROSS_MAP, 3));
     norm_layer_2.configure(&src, &dst, NormalizationLayerInfo(NormType::IN_MAP_1D, 3));
 
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
     // Allocate tensors
     src.allocator()->allocate();
     dst.allocator()->allocate();
 
-    ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
     // Finalize memory manager
     mm->populate(allocator, 1 /* num_pools */);
diff --git a/tests/validation/NEON/UNIT/RuntimeContext.cpp b/tests/validation/NEON/UNIT/RuntimeContext.cpp
index 6070a88d6b..e126aded28 100644
--- a/tests/validation/NEON/UNIT/RuntimeContext.cpp
+++ b/tests/validation/NEON/UNIT/RuntimeContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,6 +48,24 @@ namespace validation
 {
 TEST_SUITE(NEON)
 TEST_SUITE(UNIT)
+#if defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
+TEST_CASE(CpuCapacity, framework::DatasetMode::ALL)
+{
+    CPUInfo& ci =  arm_compute::Scheduler::get().cpu_info();
+    const uint32_t total_num_cpus = ci.get_cpu_num();
+    const uint32_t nonlittle_num_cpus = ci.get_cpu_num_excluding_little();
+    const bool has_lmb = ci.cpu_has_little_mid_big();
+    const uint32_t num_threads = arm_compute::Scheduler::get().num_threads();
+
+    if(has_lmb){
+        ARM_COMPUTE_EXPECT(total_num_cpus!=nonlittle_num_cpus , framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(num_threads==nonlittle_num_cpus , framework::LogLevel::ERRORS);
+    }
+}
+#endif /* defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
+
 TEST_SUITE(RuntimeContext)
 
 TEST_CASE(Scheduler, framework::DatasetMode::ALL)
@@ -57,14 +75,14 @@ TEST_CASE(Scheduler, framework::DatasetMode::ALL)
     RuntimeContext ctx;
 
     // Check if it's been initialised properly
-    ARM_COMPUTE_EXPECT(ctx.scheduler() != nullptr, framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(ctx.asset_manager() == nullptr, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(ctx.scheduler() != nullptr);
+    ARM_COMPUTE_ASSERT(ctx.asset_manager() == nullptr);
 
     // Create a Scheduler
     auto scheduler = SchedulerFactory::create();
     ctx.set_scheduler(scheduler.get());
     // Check if the scheduler has been properly setup
-    ARM_COMPUTE_EXPECT(ctx.scheduler() != nullptr, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(ctx.scheduler() != nullptr);
 
     // Create a new activation function
     NEActivationLayer act_layer(&ctx);
@@ -74,14 +92,14 @@ TEST_CASE(Scheduler, framework::DatasetMode::ALL)
 
     act_layer.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR));
 
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
     // Allocate tensors
     src.allocator()->allocate();
     dst.allocator()->allocate();
 
-    ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
 
     float min_bound = 0;
     float max_bound = 0;
@@ -117,10 +135,10 @@ TEST_CASE(MultipleThreadedScheduller, framework::DatasetMode::ALL)
     act_layer_thread0.configure(&src_t0, &dst_t0, activation_info);
     act_layer_thread1.configure(&src_t1, &dst_t1, activation_info);
 
-    ARM_COMPUTE_EXPECT(src_t0.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_t0.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(src_t1.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_t1.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(src_t0.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(dst_t0.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(src_t1.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(dst_t1.info()->is_resizable());
 
     // Allocate tensors
     src_t0.allocator()->allocate();
@@ -128,8 +146,8 @@ TEST_CASE(MultipleThreadedScheduller, framework::DatasetMode::ALL)
     src_t1.allocator()->allocate();
     dst_t1.allocator()->allocate();
 
-    ARM_COMPUTE_EXPECT(!src_t0.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!src_t1.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!src_t0.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(!src_t1.info()->is_resizable());
 
     float min_bound = 0;
     float max_bound = 0;
@@ -159,7 +177,7 @@ TEST_CASE(MultipleThreadedScheduller, framework::DatasetMode::ALL)
 
 TEST_SUITE_END() // RuntimeContext
 TEST_SUITE_END() // UNIT
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/UNIT/TensorAllocator.cpp b/tests/validation/NEON/UNIT/TensorAllocator.cpp
index 273d2e0a4f..0aab9ef9b5 100644
--- a/tests/validation/NEON/UNIT/TensorAllocator.cpp
+++ b/tests/validation/NEON/UNIT/TensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,8 +29,6 @@
 #include "arm_compute/runtime/MemoryRegion.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 
-#include "support/MemorySupport.h"
-
 #include "tests/Globals.h"
 #include "tests/Utils.h"
 #include "tests/framework/Asserts.h"
@@ -58,37 +56,37 @@ TEST_CASE(ImportMemory, framework::DatasetMode::ALL)
 
     // Allocate memory buffer
     const size_t total_size = info.total_size();
-    auto         data       = support::cpp14::make_unique<uint8_t[]>(total_size);
+    auto         data       = std::make_unique<uint8_t[]>(total_size);
 
     // Negative case : Import nullptr
     Tensor t1;
     t1.allocator()->init(info);
-    ARM_COMPUTE_EXPECT(!bool(t1.allocator()->import_memory(nullptr)), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t1.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!bool(t1.allocator()->import_memory(nullptr)));
+    ARM_COMPUTE_ASSERT(t1.info()->is_resizable());
 
     // Negative case : Import misaligned pointer
     Tensor       t2;
     const size_t required_alignment = 339;
     t2.allocator()->init(info, required_alignment);
-    ARM_COMPUTE_EXPECT(!bool(t2.allocator()->import_memory(data.get())), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t2.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!bool(t2.allocator()->import_memory(data.get())));
+    ARM_COMPUTE_ASSERT(t2.info()->is_resizable());
 
     // Negative case : Import memory to a tensor that is memory managed
     Tensor      t3;
     MemoryGroup mg;
     t3.allocator()->set_associated_memory_group(&mg);
-    ARM_COMPUTE_EXPECT(!bool(t3.allocator()->import_memory(data.get())), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t3.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!bool(t3.allocator()->import_memory(data.get())));
+    ARM_COMPUTE_ASSERT(t3.info()->is_resizable());
 
     // Positive case : Set raw pointer
     Tensor t4;
     t4.allocator()->init(info);
-    ARM_COMPUTE_EXPECT(bool(t4.allocator()->import_memory(data.get())), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!t4.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t4.buffer() == reinterpret_cast<uint8_t *>(data.get()), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(bool(t4.allocator()->import_memory(data.get())));
+    ARM_COMPUTE_ASSERT(!t4.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(t4.buffer() == reinterpret_cast<uint8_t *>(data.get()));
     t4.allocator()->free();
-    ARM_COMPUTE_EXPECT(t4.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t4.buffer() == nullptr, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(t4.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(t4.buffer() == nullptr);
 }
 
 TEST_CASE(ImportMemoryMalloc, framework::DatasetMode::ALL)
@@ -111,13 +109,13 @@ TEST_CASE(ImportMemoryMalloc, framework::DatasetMode::ALL)
     const size_t total_size_in_elems = tensor.info()->tensor_shape().total_size();
     const size_t total_size_in_bytes = tensor.info()->total_size();
     size_t       space               = total_size_in_bytes + required_alignment;
-    auto         raw_data            = support::cpp14::make_unique<uint8_t[]>(space);
+    auto         raw_data            = std::make_unique<uint8_t[]>(space);
 
     void *aligned_ptr = raw_data.get();
-    support::cpp11::align(required_alignment, total_size_in_bytes, aligned_ptr, space);
+    std::align(required_alignment, total_size_in_bytes, aligned_ptr, space);
 
-    ARM_COMPUTE_EXPECT(bool(tensor.allocator()->import_memory(aligned_ptr)), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(bool(tensor.allocator()->import_memory(aligned_ptr)));
+    ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
 
     // Fill tensor
     std::uniform_real_distribution<float> distribution(-5.f, 5.f);
@@ -139,7 +137,7 @@ TEST_CASE(ImportMemoryMalloc, framework::DatasetMode::ALL)
 
     // Release resources
     tensor.allocator()->free();
-    ARM_COMPUTE_EXPECT(tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
 }
 
 TEST_CASE(ImportMemoryMallocPadded, framework::DatasetMode::ALL)
@@ -160,10 +158,10 @@ TEST_CASE(ImportMemoryMallocPadded, framework::DatasetMode::ALL)
 
     // Allocate and import tensor
     const size_t total_size_in_bytes = tensor.info()->total_size();
-    auto         raw_data            = support::cpp14::make_unique<uint8_t[]>(total_size_in_bytes);
+    auto         raw_data            = std::make_unique<uint8_t[]>(total_size_in_bytes);
 
-    ARM_COMPUTE_EXPECT(bool(tensor.allocator()->import_memory(raw_data.get())), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(bool(tensor.allocator()->import_memory(raw_data.get())));
+    ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
 
     // Fill tensor while accounting padding
     std::uniform_real_distribution<float> distribution(-5.f, 5.f);
@@ -192,10 +190,10 @@ TEST_CASE(ImportMemoryMallocPadded, framework::DatasetMode::ALL)
 
     // Release resources
     tensor.allocator()->free();
-    ARM_COMPUTE_EXPECT(tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
 }
 
-#if !defined(BARE_METAL)
+#if !defined(_WIN64) && !defined(BARE_METAL)
 TEST_CASE(ImportMemoryMappedFile, framework::DatasetMode::ALL)
 {
     const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU);
@@ -223,12 +221,12 @@ TEST_CASE(ImportMemoryMappedFile, framework::DatasetMode::ALL)
 
     // Map file
     utils::mmap_io::MMappedFile mmapped_file("test_mmap_import.bin", 0 /** Whole file */, 0);
-    ARM_COMPUTE_EXPECT(mmapped_file.is_mapped(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(mmapped_file.is_mapped());
     unsigned char *data = mmapped_file.data();
 
     // Import memory mapped memory
-    ARM_COMPUTE_EXPECT(bool(tensor.allocator()->import_memory(data)), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(bool(tensor.allocator()->import_memory(data)));
+    ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
 
     // Fill tensor
     std::uniform_real_distribution<float> distribution(-5.f, 5.f);
@@ -250,9 +248,9 @@ TEST_CASE(ImportMemoryMappedFile, framework::DatasetMode::ALL)
 
     // Release resources
     tensor.allocator()->free();
-    ARM_COMPUTE_EXPECT(tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
 }
-#endif // !defined(BARE_METAL)
+#endif // !defined(_WIN64) && !defined(BARE_METAL)
 
 TEST_CASE(AlignedAlloc, framework::DatasetMode::ALL)
 {
@@ -264,7 +262,7 @@ TEST_CASE(AlignedAlloc, framework::DatasetMode::ALL)
     t.allocator()->init(info, requested_alignment);
     t.allocator()->allocate();
 
-    ARM_COMPUTE_EXPECT(t.buffer() != nullptr, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(t.buffer() != nullptr);
     ARM_COMPUTE_EXPECT(t.allocator()->alignment() == requested_alignment, framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(arm_compute::utility::check_aligned(reinterpret_cast<void *>(t.buffer()), requested_alignment),
                        framework::LogLevel::ERRORS);
diff --git a/tests/validation/NEON/Unstack.cpp b/tests/validation/NEON/Unstack.cpp
index aa8dcc5a65..3e8f1ff324 100644
--- a/tests/validation/NEON/Unstack.cpp
+++ b/tests/validation/NEON/Unstack.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -122,7 +122,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEUnstackFixture<uint8_t>, framework::DatasetMo
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE_END() // Unstack
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Upsample.cpp b/tests/validation/NEON/Upsample.cpp
deleted file mode 100644
index 799e513fb1..0000000000
--- a/tests/validation/NEON/Upsample.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEUpsampleLayer.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/UpsampleLayerFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(UpsampleLayer)
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(10U, 10U, 2U), 1, DataType::F32), // Mismatching data type
-                                            TensorInfo(TensorShape(10U, 10U, 2U), 1, DataType::F32), // Invalid output shape
-                                            TensorInfo(TensorShape(10U, 10U, 2U), 1, DataType::F32), // Invalid stride
-                                            TensorInfo(TensorShape(10U, 10U, 2U), 1, DataType::F32), // Invalid policy
-                                            TensorInfo(TensorShape(32U, 32U), 1, DataType::F32),
-                                          }),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(20U, 20U, 2U), 1, DataType::F16),
-                                            TensorInfo(TensorShape(20U, 10U, 2U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(20U, 20U, 2U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(20U, 20U, 2U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(64U, 64U), 1, DataType::F32),
-                                          })),
-    framework::dataset::make("PadInfo", { Size2D(2, 2),
-                                          Size2D(2, 2),
-                                          Size2D(1, 1),
-                                          Size2D(2, 2),
-                                          Size2D(2, 2),
-                                           })),
-   framework::dataset::make("UpsamplingPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR,
-                                                  InterpolationPolicy::NEAREST_NEIGHBOR,
-                                                  InterpolationPolicy::NEAREST_NEIGHBOR,
-                                                  InterpolationPolicy::BILINEAR,
-                                                  InterpolationPolicy::NEAREST_NEIGHBOR,
-                                                })),
-    framework::dataset::make("Expected", { false, false, false, false, true })),
-    input_info, output_info, pad_info, policy, expected)
-{
-    bool is_valid = bool(NEUpsampleLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pad_info, policy));
-    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
-
-template <typename T>
-using NEUpsampleLayerFixture = UpsampleLayerFixture<Tensor, Accessor, NEUpsampleLayer, T>;
-
-template <typename T>
-using NEUpsampleLayerQuantizedFixture = UpsampleLayerQuantizedFixture<Tensor, Accessor, NEUpsampleLayer, T>;
-
-TEST_SUITE(Float)
-TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEUpsampleLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                   framework::dataset::make("PadInfo", { Size2D(2, 2) })),
-                                                                                                           framework::dataset::make("UpsamplingPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // FP32
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEUpsampleLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                  framework::dataset::make("DataType",
-                                                                                                                          DataType::F16)),
-                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                  framework::dataset::make("PadInfo", { Size2D(2, 2) })),
-                                                                                                          framework::dataset::make("UpsamplingPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // FP16
-#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-TEST_SUITE_END() // Float
-
-TEST_SUITE(Quantized)
-TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEUpsampleLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                      framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                      framework::dataset::make("PadInfo", { Size2D(2, 2) })),
-                                                                                                                      framework::dataset::make("UpsamplingPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR })),
-                                                                                                                      framework::dataset::make("QuantizationInfo", QuantizationInfo(2.f / 255.f, 10))))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // QASYMM8
-TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEUpsampleLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                     framework::dataset::make("PadInfo", { Size2D(2, 2) })),
-                                                                                                                     framework::dataset::make("UpsamplingPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR })),
-                                                                                                                     framework::dataset::make("QuantizationInfo", QuantizationInfo(2.f / 255.f, 10))))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // QASYMM8_SIGNED
-TEST_SUITE_END() // Quantized
-
-TEST_SUITE_END() // UpsampleLayer
-TEST_SUITE_END() // NEON
-
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/WarpAffine.cpp b/tests/validation/NEON/WarpAffine.cpp
deleted file mode 100644
index 91748580a4..0000000000
--- a/tests/validation/NEON/WarpAffine.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEWarpAffine.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/InterpolationPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/WarpAffineFixture.h"
-#include "tests/validation/reference/Utils.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/** Tolerance */
-constexpr AbsoluteTolerance<uint8_t> tolerance(1);
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(WarpAffine)
-
-template <typename T>
-using NEWarpAffineFixture = WarpAffineValidationFixture<Tensor, Accessor, NEWarpAffine, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEWarpAffineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
-                                                                                                                framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                        datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, _valid_mask, tolerance, 0.02f);
-}
-DISABLED_FIXTURE_DATA_TEST_CASE(RunLarge, NEWarpAffineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::U8)),
-                                                                                                                 framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                 datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, _valid_mask, tolerance, 0.02f);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/WarpPerspective.cpp b/tests/validation/NEON/WarpPerspective.cpp
deleted file mode 100644
index 28f206d4d1..0000000000
--- a/tests/validation/NEON/WarpPerspective.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/WarpPerspectiveFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr AbsoluteTolerance<uint8_t> tolerance_value(1);
-constexpr float                      tolerance_number = 0.2f;
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(WarpPerspective)
-
-template <typename T>
-using NEWarpPerspectiveFixture = WarpPerspectiveValidationFixture<Tensor, Accessor, NEWarpPerspective, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEWarpPerspectiveFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                     DataType::U8)),
-                                                                                                                     framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                             datasets::BorderModes()))
-{
-    validate(Accessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEWarpPerspectiveFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                     DataType::U8)),
-                                                                                                                     framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                             datasets::BorderModes()))
-{
-    validate(Accessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/YOLOLayer.cpp b/tests/validation/NEON/YOLOLayer.cpp
deleted file mode 100644
index 4806122efd..0000000000
--- a/tests/validation/NEON/YOLOLayer.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEYOLOLayer.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ActivationFunctionsDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/YOLOLayerFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/** Tolerance */
-constexpr AbsoluteTolerance<float> tolerance_f32(1e-6f);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-constexpr RelativeTolerance<float> tolerance_f16(0.01f);
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-/** Floating point data sets. */
-const auto YOLODataset = combine(combine(combine(combine(framework::dataset::make("InPlace", { false, true }), framework::dataset::make("ActivationFunction",
-                                                         ActivationLayerInfo::ActivationFunction::LOGISTIC)),
-                                                 framework::dataset::make("AlphaBeta", { 0.5f, 1.f })),
-                                         framework::dataset::make("Classes", 40)),
-                                 framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }));
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(YOLOLayer)
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::U8),  // Wrong input data type
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),  // Invalid activation info
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),  // Wrong output data type
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),  // wrong number of classes
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),  // Mismatching shapes
-                                                       TensorInfo(TensorShape(17U, 16U, 6U), 1, DataType::F32),  // shrink window
-                                                       TensorInfo(TensorShape(17U, 16U, 7U), 1, DataType::F32),  // channels not multiple of (num_classes + 5)
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),  // Valid
-                                                     }),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::U16),
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 11U, 6U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 16U, 7U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),
-                                                     })),
-               framework::dataset::make("ActivationInfo", { ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
-                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                                     })),
-               framework::dataset::make("Numclasses", { 1, 1, 1, 0, 1, 1, 1, 1
-                                                     })),
-               framework::dataset::make("Expected", { false, false, false, false, false, false, false, true})),
-               input_info, output_info, act_info, num_classes, expected)
-{
-    ARM_COMPUTE_EXPECT(bool(NEYOLOLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), act_info, num_classes)) == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
-
-template <typename T>
-using NEYOLOLayerFixture = YOLOValidationFixture<Tensor, Accessor, NEYOLOLayer, T>;
-
-TEST_SUITE(Float)
-TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEYOLOLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallYOLOShapes(), YOLODataset), framework::dataset::make("DataType",
-                                                                                                       DataType::F32)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_f32);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEYOLOLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeYOLOShapes(), YOLODataset), framework::dataset::make("DataType",
-                                                                                                     DataType::F32)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_f32);
-}
-TEST_SUITE_END() // FP32
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEYOLOLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallYOLOShapes(), YOLODataset), framework::dataset::make("DataType",
-                                                                                                      DataType::F16)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_f16);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEYOLOLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeYOLOShapes(), YOLODataset), framework::dataset::make("DataType",
-                                                                                                    DataType::F16)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_f16);
-}
-TEST_SUITE_END() // FP16
-#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-TEST_SUITE_END() // Float
-
-TEST_SUITE_END() // YOLOLayer
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute