146 files changed, 8146 insertions, 9183 deletions
diff --git a/tests/validation/NEON/AbsoluteDifference.cpp b/tests/validation/NEON/AbsoluteDifference.cpp
deleted file mode 100644
index a2debccc91..0000000000
--- a/tests/validation/NEON/AbsoluteDifference.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ConvertPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/AbsoluteDifferenceFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/** Input data sets **/
-const auto AbsoluteDifferenceU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType",
-                                                 DataType::U8));
-const auto AbsoluteDifferenceS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
-                                                  framework::dataset::make("DataType", DataType::S16));
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(AbsoluteDifference)
-
-template <typename T>
-using NEAbsoluteDifferenceFixture = AbsoluteDifferenceValidationFixture<Tensor, Accessor, NEAbsoluteDifference, T>;
-
-TEST_SUITE(U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), AbsoluteDifferenceU8Dataset),
-               shape, data_type0, data_type1, output_data_type)
-{
-    // Create tensors
-    Tensor ref_src1 = create_tensor<Tensor>(shape, data_type0);
-    Tensor ref_src2 = create_tensor<Tensor>(shape, data_type1);
-    Tensor dst      = create_tensor<Tensor>(shape, output_data_type);
-
-    // Create and Configure function
-    NEAbsoluteDifference abs_diff;
-    abs_diff.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEAbsoluteDifferenceFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), AbsoluteDifferenceU8Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEAbsoluteDifferenceFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), AbsoluteDifferenceU8Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), AbsoluteDifferenceS16Dataset),
-               shape, data_type0, data_type1, output_data_type)
-{
-    // Create tensors
-    Tensor ref_src1 = create_tensor<Tensor>(shape, data_type0);
-    Tensor ref_src2 = create_tensor<Tensor>(shape, data_type1);
-    Tensor dst      = create_tensor<Tensor>(shape, output_data_type);
-
-    // Create and Configure function
-    NEAbsoluteDifference abs_diff;
-    abs_diff.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEAbsoluteDifferenceFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), AbsoluteDifferenceS16Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEAbsoluteDifferenceFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), AbsoluteDifferenceS16Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // S16
-
-TEST_SUITE_END() // AbsoluteDifference
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Accumulate.cpp b/tests/validation/NEON/Accumulate.cpp
deleted file mode 100644
index 6176eb08fb..0000000000
--- a/tests/validation/NEON/Accumulate.cpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEAccumulate.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ConvertPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/AccumulateFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/** Tolerance value for comparing reference's output against implementation's output for floating point data types */
-constexpr AbsoluteTolerance<float> tolerance(1.0f);
-/** Input data sets **/
-const auto AccumulateU8Dataset  = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8));
-const auto AccumulateS16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S16));
-} // namespace
-TEST_SUITE(NEON)
-TEST_SUITE(Accumulate)
-
-TEST_SUITE(U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), AccumulateS16Dataset),
-               shape, data_type, output_data_type)
-{
-    // Create tensors
-    Tensor ref_src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst     = create_tensor<Tensor>(shape, output_data_type);
-
-    // Create and Configure function
-    NEAccumulate accum;
-    accum.configure(&ref_src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(ref_src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
-template <typename T1>
-using NEAccumulateFixture = AccumulateValidationFixture<Tensor, Accessor, NEAccumulate, T1, int16_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEAccumulateFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), AccumulateS16Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEAccumulateFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), AccumulateS16Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance);
-}
-
-TEST_SUITE_END() // U8
-TEST_SUITE_END() // Accumulate
-
-TEST_SUITE(AccumulateWeighted)
-
-TEST_SUITE(U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), AccumulateU8Dataset),
-               shape, data_type, output_data_type)
-{
-    // Generate a random alpha value
-    std::mt19937                     gen(library->seed());
-    std::uniform_real_distribution<> float_dist(0, 1);
-    const float                      alpha = float_dist(gen);
-
-    // Create tensors
-    Tensor ref_src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst     = create_tensor<Tensor>(shape, output_data_type);
-
-    // Create and Configure function
-    NEAccumulateWeighted accum_weight;
-    accum_weight.configure(&ref_src, alpha, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(ref_src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
-template <typename T1>
-using NEAccumulateWeightedFixture = AccumulateWeightedValidationFixture<Tensor, Accessor, NEAccumulateWeighted, T1, uint8_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEAccumulateWeightedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), AccumulateU8Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEAccumulateWeightedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), AccumulateU8Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance);
-}
-
-TEST_SUITE_END() // U8
-TEST_SUITE_END() // AccumulateWeighted
-
-TEST_SUITE(AccumulateSquared)
-
-TEST_SUITE(U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), AccumulateS16Dataset),
-               shape, data_type, output_data_type)
-{
-    // Generate a random shift value
-    std::mt19937                            gen(library->seed());
-    std::uniform_int_distribution<uint32_t> int_dist(0, 15);
-    const uint32_t                          shift = int_dist(gen);
-
-    // Create tensors
-    Tensor ref_src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst     = create_tensor<Tensor>(shape, output_data_type);
-
-    // Create and Configure function
-    NEAccumulateSquared accum_square;
-    accum_square.configure(&ref_src, shift, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(ref_src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
-template <typename T1>
-using NEAccumulateSquaredFixture = AccumulateSquaredValidationFixture<Tensor, Accessor, NEAccumulateSquared, T1, int16_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEAccumulateSquaredFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), AccumulateS16Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEAccumulateSquaredFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), AccumulateS16Dataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance);
-}
-
-TEST_SUITE_END() // U8
-TEST_SUITE_END() // AccumulateSquared
-
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp
index 063bfaa2cd..73f5de68ac 100644
--- a/tests/validation/NEON/ActivationLayer.cpp
+++ b/tests/validation/NEON/ActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,10 +22,14 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/RuntimeContext.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuActivationKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ActivationFunctionsDataset.h"
@@ -36,6 +40,9 @@
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ActivationLayerFixture.h"
 
+#include "arm_compute/Acl.hpp"
+#include "support/AclRequires.h"
+
 namespace arm_compute
 {
 namespace test
@@ -44,6 +51,8 @@ namespace validation
 {
 namespace
 {
+const RelativeTolerance<float> tolerance_float_sqrt(0.0001f); /**< Relative tolerance used by the float SQRT boundary-value test */
+
 /** Define relative tolerance of the activation layer.
  *
  * @param[in] data_type  The data type used.
@@ -56,17 +65,35 @@ RelativeTolerance<float> relative_tolerance(DataType data_type, ActivationLayerI
     switch(activation)
     {
         case ActivationLayerInfo::ActivationFunction::LOGISTIC:
-        case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
         case ActivationLayerInfo::ActivationFunction::ELU:
         case ActivationLayerInfo::ActivationFunction::SQRT:
         case ActivationLayerInfo::ActivationFunction::TANH:
+        case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+        case ActivationLayerInfo::ActivationFunction::SWISH:
+        case ActivationLayerInfo::ActivationFunction::GELU:
             switch(data_type)
             {
                 case DataType::F16:
+#if defined(ENABLE_SVE)
+                    return RelativeTolerance<float>(0.25f);
+#else  // !defined(ENABLE_SVE)
                     return RelativeTolerance<float>(0.1f);
+#endif // defined(ENABLE_SVE)
                 default:
                     return RelativeTolerance<float>(0.05f);
             }
+        case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+            switch(data_type)
+            {
+                case DataType::F16:
+#if defined(ENABLE_SVE)
+                    return RelativeTolerance<float>(0.9f);
+#else  // !defined(ENABLE_SVE)
+                    return RelativeTolerance<float>(0.01f);
+#endif // defined(ENABLE_SVE)
+                default:
+                    return RelativeTolerance<float>(0.00001f);
+            }
         default:
             return RelativeTolerance<float>(0.f);
     }
@@ -84,13 +111,30 @@ AbsoluteTolerance<float> absolute_tolerance(DataType data_type, ActivationLayerI
     switch(activation)
     {
         case ActivationLayerInfo::ActivationFunction::LOGISTIC:
-        case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
         case ActivationLayerInfo::ActivationFunction::SQRT:
         case ActivationLayerInfo::ActivationFunction::TANH:
+        case ActivationLayerInfo::ActivationFunction::SWISH:
+        case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
             switch(data_type)
             {
                 case DataType::F16:
+#if defined(ENABLE_SVE)
+                    return AbsoluteTolerance<float>(0.25f);
+#else  // !defined(ENABLE_SVE)
                     return AbsoluteTolerance<float>(0.01f);
+#endif // defined(ENABLE_SVE)
+                default:
+                    return AbsoluteTolerance<float>(0.00001f);
+            }
+        case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+            switch(data_type)
+            {
+                case DataType::F16:
+#if defined(ENABLE_SVE)
+                    return AbsoluteTolerance<float>(0.9f);
+#else  // !defined(ENABLE_SVE)
+                    return AbsoluteTolerance<float>(0.01f);
+#endif // defined(ENABLE_SVE)
                 default:
                     return AbsoluteTolerance<float>(0.00001f);
             }
@@ -99,12 +143,27 @@ AbsoluteTolerance<float> absolute_tolerance(DataType data_type, ActivationLayerI
     }
 }
 
-/** Tolerance for quantized asymmetric operations */
-#if defined(__aarch64__)
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
-#else  // defined(__aarch64__)
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
-#endif // defined(__aarch64__)
+/** Define absolute tolerance of the activation layer for qasymm8.
+ *
+ * @param[in] activation The activation function used.
+ *
+ * @return Absolute tolerance depending on the activation function.
+ */
+AbsoluteTolerance<uint8_t> tolerance_qasymm8(ActivationLayerInfo::ActivationFunction activation)
+{
+    switch(activation)
+    {
+        case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+        case ActivationLayerInfo::ActivationFunction::SQRT:
+        case ActivationLayerInfo::ActivationFunction::TANH:
+        case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+        case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+        case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+            return AbsoluteTolerance<uint8_t>(1);
+        default:
+            return AbsoluteTolerance<uint8_t>(0);
+    }
+}
 
 constexpr AbsoluteTolerance<int16_t> tolerance_qsymm16(1);
 
@@ -117,55 +176,93 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
     DataType::F32,
 });
 
-const auto NeonActivationFunctionsDataset = concat(datasets::ActivationFunctions(), framework::dataset::make("ActivationFunction", ActivationLayerInfo::ActivationFunction::HARD_SWISH));
+const auto NeonActivationFunctionsDataset = concat(datasets::ActivationFunctions(),
+                                                   framework::dataset::make("ActivationFunction", { ActivationLayerInfo::ActivationFunction::HARD_SWISH, ActivationLayerInfo::ActivationFunction::SWISH }));
 
 /** Input data sets. */
 const auto ActivationDataset = combine(combine(framework::dataset::make("InPlace", { false, true }), NeonActivationFunctionsDataset), framework::dataset::make("AlphaBeta", { 0.5f, 1.f }));
+
+template <typename T, ARM_COMPUTE_REQUIRES_TA(arm_compute::utils::traits::is_floating_point<T>::value)>
+void test_float_sqrt_boundary_value()
+{
+    constexpr auto vector_size = uint32_t{ 16 };
+
+    auto data_type = DataType::F32;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    data_type = std::is_same<T, half>::value ? DataType::F16 : data_type;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+    const auto boundary_value_vector = std::vector<T>
+    {
+        std::numeric_limits<T>::min(),
+        T(0),
+        std::numeric_limits<T>::epsilon(),
+        std::numeric_limits<T>::max(),
+    };
+
+    // The following size ensures that the whole logic (vector + left-over) is tested
+    // using all boundary values iff boundary_value_vector.size() is smaller than vector_size.
+    auto shape = TensorShape{ vector_size + boundary_value_vector.size() };
+    auto info  = ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::SQRT };
+    auto src   = create_tensor<Tensor>(shape, data_type);
+
+    auto act = NEActivationLayer{};
+    act.configure(&src, nullptr, info);
+    src.allocator()->allocate();
+    library->fill_static_values(Accessor(src), boundary_value_vector);
+    act.run();
+
+    auto reference_src = SimpleTensor<T> { shape, data_type };
+    library->fill_static_values(reference_src, boundary_value_vector);
+    auto reference_dst = reference::activation_layer<T>(reference_src, info);
+
+    validate(Accessor(src), reference_dst, tolerance_float_sqrt);
+}
 } // namespace
 
 TEST_SUITE(NEON)
 TEST_SUITE(ActivationLayer)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), CNNDataTypes), framework::dataset::make("InPlace", { false, true })),
-               shape, data_type, in_place)
+/** Test case for the public C++ API of the activation operator (acl::Activation).
+ *
+ * Create a CPU context, a queue, a ReLU operator and its tensors, then execute the operator once.
+ *
+ * Checks performed in order:
+ * - Creation, tensor packing and execution report acl::StatusCode::Success
+ */
+TEST_CASE(ActivationAPI, framework::DatasetMode::ALL)
 {
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type, 1);
-    Tensor dst = create_tensor<Tensor>(shape, data_type, 1);
+    acl::StatusCode err = acl::StatusCode::Success;
 
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+    // Create context & Queue
+    acl::Context ctx(acl::Target::Cpu, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
 
-    // Create context
-    RuntimeContext ctx;
+    acl::Queue queue(ctx, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
 
-    // Create and configure function
-    NEActivationLayer act_layer(&ctx);
+    // Create activation operator
+    acl::TensorDescriptor src_info({ 2, 3 }, acl::DataType::Float32);
+    acl::TensorDescriptor dst_info({ 2, 3 }, acl::DataType::Float32);
+    acl::ActivationDesc   desc{ AclRelu, 6.f, 0.f, false };
 
-    if(in_place)
-    {
-        act_layer.configure(&src, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS));
-    }
-    else
-    {
-        act_layer.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS));
-    }
+    acl::Activation act(ctx, src_info, dst_info, desc, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
 
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
+    // Create tensors and feed
+    acl::Tensor src(ctx, src_info, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+    acl::Tensor dst(ctx, dst_info, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
 
-    if(!in_place)
-    {
-        validate(dst.info()->valid_region(), valid_region);
-    }
+    acl::TensorPack pack(ctx);
+    // Check each insertion on its own so a failing ACL_SRC add is not masked by a successful ACL_DST add
+    ARM_COMPUTE_ASSERT(pack.add(src, ACL_SRC) == acl::StatusCode::Success);
+    ARM_COMPUTE_ASSERT(pack.add(dst, ACL_DST) == acl::StatusCode::Success);
 
-    // Validate padding
-    validate(src.info()->padding(), PaddingSize());
-    if(!in_place)
-    {
-        validate(dst.info()->padding(), PaddingSize());
-    }
+    // Execute operator
+    err = act.run(queue, pack);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
 }
 
 // *INDENT-OFF*
@@ -189,6 +286,49 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
     bool is_valid = bool(NEActivationLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), act_info));
     ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
 }
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, concat(concat(
+               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType", { DataType::F32,
+                                                              DataType::F16,
+                                                              DataType::QASYMM8,
+                                                              DataType::QASYMM8_SIGNED,
+                                                              DataType::QSYMM16
+                                                            })),
+                combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                        framework::dataset::make("DataType", { DataType::F32,
+                                                               DataType::F16,
+                                                             }))),
+                combine(framework::dataset::make("CpuExt", std::string("SVE2")),
+                        framework::dataset::make("DataType", { DataType::QASYMM8,
+                                                               DataType::QASYMM8_SIGNED,
+                                                               DataType::QSYMM16
+                                                             }))),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.sve2 = (cpu_ext == "SVE2");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{data_type, CPUModel::GENERIC, cpu_isa,ActivationLayerInfo::ActivationFunction::BOUNDED_RELU}, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_activation";
+    if( data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)
+    {
+#ifdef __aarch64__
+        expected = "neon_q8_activation_lut";
+#else  // __aarch64__
+        expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_activation";
+#endif // __aarch64__
+    }
+    std::string actual   = selected_impl->name;
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
 // clang-format on
 // *INDENT-ON*
 
@@ -198,6 +338,10 @@ using NEActivationLayerFixture = ActivationValidationFixture<Tensor, Accessor, N
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
+TEST_CASE(SqrtBoundaryValue, framework::DatasetMode::ALL)
+{
+    test_float_sqrt_boundary_value<half>();
+}
 FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ActivationDataset),
                                                                                                       framework::dataset::make("DataType",
                                                                                                               DataType::F16)))
@@ -205,10 +349,14 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerFixture<half>, framework::Data
     // Validate output
     validate(Accessor(_target), _reference, relative_tolerance(_data_type, _function), 0.f, absolute_tolerance(_data_type, _function));
 }
-TEST_SUITE_END()
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+TEST_SUITE_END() // FP16
+#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(FP32)
+TEST_CASE(SqrtBoundaryValue, framework::DatasetMode::ALL)
+{
+    test_float_sqrt_boundary_value<float>();
+}
 FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ActivationDataset), framework::dataset::make("DataType",
                                                                                                        DataType::F32)))
 
@@ -223,12 +371,15 @@ template <typename T>
 using NEActivationLayerQuantizedFixture = ActivationValidationQuantizedFixture<Tensor, Accessor, NEActivationLayer, T>;
 
 /** Input data sets. */
-const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationFunction", { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
-                                                                                                  ActivationLayerInfo::ActivationFunction::RELU,
-                                                                                                  ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-                                                                                                  ActivationLayerInfo::ActivationFunction::LOGISTIC,
-                                                                                                  ActivationLayerInfo::ActivationFunction::TANH
-                                                                                                });
+const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationFunction",
+{
+    ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+    ActivationLayerInfo::ActivationFunction::RELU,
+    ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+    ActivationLayerInfo::ActivationFunction::LOGISTIC,
+    ActivationLayerInfo::ActivationFunction::TANH,
+    ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
+});
 
 const auto QuantizedActivationDataset = combine(combine(framework::dataset::make("InPlace", { false }),
                                                         concat(QuantizedActivationFunctionsDataset, framework::dataset::make("ActivationFunction", ActivationLayerInfo::ActivationFunction::HARD_SWISH))),
@@ -242,7 +393,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerQuantizedFixture<uint8_t>, fra
                                                                                                                   framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.1f, 128.0f) })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8(_function));
 }
 TEST_SUITE_END() // QASYMM8
 
@@ -253,14 +404,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerQuantizedFixture<int8_t>, fram
                                                                                                                  framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10.0f) })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8(_function));
 }
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 /** Input data sets. */
-const auto Int16QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationFunction", { ActivationLayerInfo::ActivationFunction::LOGISTIC,
-                                                                                                       ActivationLayerInfo::ActivationFunction::TANH
-                                                                                                     });
+const auto Int16QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationFunction",
+{
+    ActivationLayerInfo::ActivationFunction::LOGISTIC,
+    ActivationLayerInfo::ActivationFunction::TANH,
+    ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+});
 const auto Int16QuantizedActivationDataset = combine(combine(framework::dataset::make("InPlace", { false }), Int16QuantizedActivationFunctionsDataset),
                                                      framework::dataset::make("AlphaBeta", { 0.5f, 1.f }));
 
@@ -277,7 +431,7 @@ TEST_SUITE_END() // QSYMM16
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // ActivationLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/AddMulAdd.cpp b/tests/validation/NEON/AddMulAdd.cpp
new file mode 100644
index 0000000000..77e3d80fe6
--- /dev/null
+++ b/tests/validation/NEON/AddMulAdd.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/AddMulAddFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr AbsoluteTolerance<float> tolerance_fp32(0.001f);     /**< Tolerance for floating point tests */
+const AbsoluteTolerance<half>      tolerance_fp16(half(0.1f)); /**< Tolerance for 16-bit floating point tests */
+constexpr AbsoluteTolerance<float> tolerance_quant(1);         /**< Tolerance for quantized tests */
+
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+
+    // Boundaries are aligned with Quantized Data ranges -- DOUBLE check before changing
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 8.f, -2.f)
+});
+
+// QASYMM8 test quantizations; the ranges quoted below correspond to scale * (q - offset) for q in [0, 255]
+const auto qasymm8_input1_qinfo_set = framework::dataset::make("Input1QInfo", { QuantizationInfo(0.1, 10) }); // Representable Range: [-1, 24.5]
+const auto qasymm8_input2_qinfo_set = framework::dataset::make("Input2QInfo", { QuantizationInfo(0.2, 60) }); // Representable Range: [-12, 39]
+const auto qasymm8_bn_mul_qinfo_set = framework::dataset::make("BnMulInfo", { QuantizationInfo(0.001, 55) }); // Representable Range: [-0.055, 0.2]
+const auto qasymm8_bn_add_qinfo_set = framework::dataset::make("BnAddInfo", { QuantizationInfo(0.02, 20) });  // Representable Range: [-0.4, 4.7]
+
+// Representable Range: [-9.36, 51.84], Expected F32 range: [-13, 63.5] (sum of the two input ranges), leaving some space for saturation
+const auto qasymm8_add_output_qinfo_set = framework::dataset::make("AddOutputInfo", { QuantizationInfo(0.24, 39) });
+
+// Representable Range: [-4.8, 10.5], Expected FP32 range: [-6.985, 12.7], leaving some space for saturation -- NOTE(review): these bounds equal 63.5 * [-0.11, 0.2]; recheck against the BnMul range [-0.055, 0.2] and the BnAdd term
+// This range also makes sense with the activation boundaries above, i.e. [-2, 8] for LU_BOUNDED_RELU and [0, 6] for BOUNDED_RELU
+const auto qasymm8_final_output_qinfo_set = framework::dataset::make("FinalOutputInfo", { QuantizationInfo(0.06, 80) });
+
+// QASYMM8_SIGNED test quantizations; the ranges quoted below correspond to scale * (q - offset) for q in [-128, 127]
+const auto qasymm8_signed_input1_qinfo_set = framework::dataset::make("Input1QInfo", { QuantizationInfo(0.1, 10) });  // Representable Range: [-13.8, 11.7]
+const auto qasymm8_signed_input2_qinfo_set = framework::dataset::make("Input2QInfo", { QuantizationInfo(0.2, -60) }); // Representable Range: [-13.6, 37.4]
+const auto qasymm8_signed_bn_mul_qinfo_set = framework::dataset::make("BnMulInfo", { QuantizationInfo(0.001, 55) });  // Representable Range: [-0.183, 0.072]
+const auto qasymm8_signed_bn_add_qinfo_set = framework::dataset::make("BnAddInfo", { QuantizationInfo(0.4, -120) });  // Representable Range: [-3.2, 98.8] -- NOTE(review): previously documented as [-0.32, 9.08]; confirm the 0.4 scale is intended
+
+// Representable Range: [-21.36, 39.84], Expected F32 range: [-27.4, 49.1] (sum of the two input ranges), leaving some space for saturation
+const auto qasymm8_signed_add_output_qinfo_set = framework::dataset::make("AddOutputInfo", { QuantizationInfo(0.24, -39) });
+
+// Representable Range: [-4.8, 10.5], Expected FP32 range: [-9.6713, 14.0942], leaving some space for saturation -- NOTE(review): these bounds assume an Input2 maximum of 39.4 and a BnAdd range of [-0.32, 9.08]; recheck
+// This range also makes sense with the activation boundaries above, i.e. [-2, 8] for LU_BOUNDED_RELU and [0, 6] for BOUNDED_RELU
+const auto qasymm8_signed_final_output_qinfo_set = framework::dataset::make("FinalOutputInfo", { QuantizationInfo(0.06, -48) });
+
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(AddMulAdd)
+
+template <typename T>
+using NEAddMulAddFloatFixture = AddMulAddFloatValidationFixture<Tensor, Accessor, NEAddMulAdd, T, true>;
+
+template <typename T>
+using NEAddMulAddFloatFixtureWoIntermOut = AddMulAddFloatValidationFixture<Tensor, Accessor, NEAddMulAdd, T, false>;
+
+TEST_SUITE(Float)
+
+TEST_SUITE(F32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAddMulAddFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(),
+                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
+                                                                                                            ActivationFunctionsDataset))
+{
+    // Validate both the intermediate (addition) output and the final output
+    validate(Accessor(_interm_target), _interm_reference); // Arithmetic Addition has a stricter tolerance: no explicit tolerance is passed
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+// Stresses the case where no intermediate output is required (i.e. nullptr); only the final output is validated
+FIXTURE_DATA_TEST_CASE(RunSmallWithoutIntermOutput, NEAddMulAddFloatFixtureWoIntermOut<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })))
+{
+    // Validate the final output only; this fixture variant has no intermediate output to check
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEAddMulAddFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(),
+                                                                                                                  framework::dataset::make("DataType", DataType::F32)),
+                                                                                                          ActivationFunctionsDataset))
+{
+    // Validate both the intermediate (addition) output and the final output
+    validate(Accessor(_interm_target), _interm_reference); // Arithmetic Addition has a stricter tolerance: no explicit tolerance is passed
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+TEST_SUITE_END() // F32
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(F16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAddMulAddFloatFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(),
+                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
+                                                                                                           ActivationFunctionsDataset))
+{
+    // Validate both the intermediate (addition) output and the final output
+    validate(Accessor(_interm_target), _interm_reference); // Arithmetic Addition has a stricter tolerance: no explicit tolerance is passed
+    validate(Accessor(_target), _reference, tolerance_fp16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEAddMulAddFloatFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(),
+                                                                                                                 framework::dataset::make("DataType", DataType::F16)),
+                                                                                                         ActivationFunctionsDataset))
+{
+    // Validate both the intermediate (addition) output and the final output
+    validate(Accessor(_interm_target), _interm_reference); // Arithmetic Addition has a stricter tolerance: no explicit tolerance is passed
+    validate(Accessor(_target), _reference, tolerance_fp16);
+}
+TEST_SUITE_END() // F16
+#endif           // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+TEST_SUITE_END() // Float
+
+template <typename T>
+using NEAddMulQuantizedFixture = AddMulAddQuantizedValidationFixture<Tensor, Accessor, NEAddMulAdd, T, true>; // NOTE(review): name drops the second "Add" used by the sibling NEAddMulAdd* aliases
+
+template <typename T>
+using NEAddMulAddQuantizedFixtureWoIntermOut = AddMulAddQuantizedValidationFixture<Tensor, Accessor, NEAddMulAdd, T, false>; // NOTE(review): not referenced by any test case in this file
+
+TEST_SUITE(Quantized)
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAddMulQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                                       ActivationFunctionsDataset),
+                                                                                                                       qasymm8_input1_qinfo_set),
+                                                                                                                       qasymm8_input2_qinfo_set),
+                                                                                                                       qasymm8_bn_mul_qinfo_set),
+                                                                                                                       qasymm8_bn_add_qinfo_set),
+                                                                                                                       qasymm8_add_output_qinfo_set),
+                                                                                                               qasymm8_final_output_qinfo_set))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference, tolerance_quant);
+    validate(Accessor(_target), _reference, tolerance_quant);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEAddMulQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
+                                                                                                                     framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                                     ActivationFunctionsDataset),
+                                                                                                                     qasymm8_input1_qinfo_set),
+                                                                                                                     qasymm8_input2_qinfo_set),
+                                                                                                                     qasymm8_bn_mul_qinfo_set),
+                                                                                                                     qasymm8_bn_add_qinfo_set),
+                                                                                                                     qasymm8_add_output_qinfo_set),
+                                                                                                             qasymm8_final_output_qinfo_set))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference, tolerance_quant);
+    validate(Accessor(_target), _reference, tolerance_quant);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAddMulQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                      framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                                                                                      ActivationFunctionsDataset),
+                                                                                                                      qasymm8_signed_input1_qinfo_set),
+                                                                                                                      qasymm8_signed_input2_qinfo_set),
+                                                                                                                      qasymm8_signed_bn_mul_qinfo_set),
+                                                                                                                      qasymm8_signed_bn_add_qinfo_set),
+                                                                                                                      qasymm8_signed_add_output_qinfo_set),
+                                                                                                              qasymm8_signed_final_output_qinfo_set))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference, tolerance_quant);
+    validate(Accessor(_target), _reference, tolerance_quant);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEAddMulQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
+                                                                                                                    framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                                                                                    ActivationFunctionsDataset),
+                                                                                                                    qasymm8_signed_input1_qinfo_set),
+                                                                                                                    qasymm8_signed_input2_qinfo_set),
+                                                                                                                    qasymm8_signed_bn_mul_qinfo_set),
+                                                                                                                    qasymm8_signed_bn_add_qinfo_set),
+                                                                                                                    qasymm8_signed_add_output_qinfo_set),
+                                                                                                            qasymm8_signed_final_output_qinfo_set))
+{
+    // Validate outputs
+    validate(Accessor(_interm_target), _interm_reference, tolerance_quant);
+    validate(Accessor(_target), _reference, tolerance_quant);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE_END() // Quantized
+
+TEST_SUITE_END() // AddMulAdd
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif // __aarch64__
diff --git a/tests/validation/NEON/ArgMinMax.cpp b/tests/validation/NEON/ArgMinMax.cpp
index e7ab4a4bbf..91b8128dea 100644
--- a/tests/validation/NEON/ArgMinMax.cpp
+++ b/tests/validation/NEON/ArgMinMax.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,6 +43,27 @@ namespace test
 {
 namespace validation
 {
+namespace
+{
+const auto OpsDataset   = framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX }); // Both arg-reduction operations
+const auto AxisDataset  = framework::dataset::make("Axis", { 0, 1, 2, 3 }); // Reduction axes paired with the 4D shape datasets
+const auto QInfoDataset = framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) }); // Shared by the QASYMM8 and QASYMM8_SIGNED suites
+
+const auto ArgMinMaxSmallDatasetAxis0 = framework::dataset::make("Shape", // 1D/2D shapes used by RunSmallAxis0 with Axis fixed to 0
+{
+    TensorShape{ 1U, 5U },
+    TensorShape{ 2U, 3U },
+    TensorShape{ 1U },
+    TensorShape{ 3U },
+    TensorShape{ 2U },
+    TensorShape{ 5U },
+    TensorShape{ 17U },
+    TensorShape{ 15U, 2U },
+});
+using ArgMinMaxSmallDataset = datasets::Small4DShapes; // Shapes for the PRECOMMIT RunSmall cases
+using ArgMinMaxLargeDataset = datasets::Large4DShapes; // Shapes for the NIGHTLY RunLarge cases
+} // namespace
+
 TEST_SUITE(NEON)
 TEST_SUITE(ArgMinMax)
 
@@ -70,23 +91,50 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-template <typename T>
-using NEArgMinMaxValidationFixture = ArgMinMaxValidationFixture<Tensor, Accessor, NEArgMinMaxLayer, T>;
+template <typename T1, typename T2>
+using NEArgMinMaxValidationFixture = ArgMinMaxValidationFixture<Tensor, Accessor, NEArgMinMaxLayer, T1, T2>;
+
+using NEArgMinMaxValidationFixture_S32_S32 = NEArgMinMaxValidationFixture<int32_t, int32_t>;
+using NEArgMinMaxValidationFixture_F16_S32 = NEArgMinMaxValidationFixture<half, int32_t>;
+using NEArgMinMaxValidationFixture_F32_S32 = NEArgMinMaxValidationFixture<float, int32_t>;
+#ifdef __aarch64__
+using NEArgMinMaxValidationFixture_F32_S64 = NEArgMinMaxValidationFixture<float, int64_t>;
+#endif // __aarch64__
 
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall,
-                       NEArgMinMaxValidationFixture<int32_t>,
+FIXTURE_DATA_TEST_CASE(RunSmallAxis0,
+                       NEArgMinMaxValidationFixture_S32_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::S32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxSmallDatasetAxis0,
+                                                       framework::dataset::make("DataTypeIn", DataType::S32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       framework::dataset::make("Axis", { 0 })),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEArgMinMaxValidationFixture_S32_S32,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::S32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       NEArgMinMaxValidationFixture<int32_t>,
+                       NEArgMinMaxValidationFixture_S32_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::S32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxLargeDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::S32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -97,18 +145,26 @@ TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       NEArgMinMaxValidationFixture<half>,
+                       NEArgMinMaxValidationFixture_F16_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::F16)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       NEArgMinMaxValidationFixture<half>,
+                       NEArgMinMaxValidationFixture_F16_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxLargeDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::F16)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -118,18 +174,41 @@ TEST_SUITE_END() // FP16
 
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       NEArgMinMaxValidationFixture<float>,
+                       NEArgMinMaxValidationFixture_F32_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::F32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
+#ifdef __aarch64__
+FIXTURE_DATA_TEST_CASE(RunSmall_F32_S64,
+                       NEArgMinMaxValidationFixture_F32_S64,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::F32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S64)),
+                                       AxisDataset),
+                               OpsDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+#endif // __aarch64__
+
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       NEArgMinMaxValidationFixture<float>,
+                       NEArgMinMaxValidationFixture_F32_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })))
+                       combine(combine(combine(combine(ArgMinMaxLargeDataset(),
+                                                       framework::dataset::make("DataTypeIn", DataType::F32)),
+                                               framework::dataset::make("DataTypeOut", DataType::S32)),
+                                       AxisDataset),
+                               OpsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -137,27 +216,35 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
-template <typename T>
-using NEArgMinMaxQuantizedValidationFixture = ArgMinMaxValidationQuantizedFixture<Tensor, Accessor, NEArgMinMaxLayer, T>;
+template <typename T1, typename T2>
+using NEArgMinMaxQuantizedValidationFixture = ArgMinMaxValidationQuantizedFixture<Tensor, Accessor, NEArgMinMaxLayer, T1, T2>;
+
+using NEArgMinMaxQuantizedValidationFixture_U8_S32 = NEArgMinMaxQuantizedValidationFixture<uint8_t, int32_t>;
+using NEArgMinMaxQuantizedValidationFixture_S8_S32 = NEArgMinMaxQuantizedValidationFixture<int8_t, int32_t>;
 
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       NEArgMinMaxQuantizedValidationFixture<uint8_t>,
+                       NEArgMinMaxQuantizedValidationFixture_U8_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       NEArgMinMaxQuantizedValidationFixture<uint8_t>,
+                       NEArgMinMaxQuantizedValidationFixture_U8_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxLargeDataset(),
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -166,29 +253,34 @@ TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall,
-                       NEArgMinMaxQuantizedValidationFixture<int8_t>,
+                       NEArgMinMaxQuantizedValidationFixture_S8_S32,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 127.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxSmallDataset(),
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8_SIGNED)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-
 FIXTURE_DATA_TEST_CASE(RunLarge,
-                       NEArgMinMaxQuantizedValidationFixture<int8_t>,
+                       NEArgMinMaxQuantizedValidationFixture_S8_S32,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 127.f, 20) })))
+                       combine(combine(combine(combine(combine(ArgMinMaxLargeDataset(),
+                                                               framework::dataset::make("DataTypeIn", DataType::QASYMM8_SIGNED)),
+                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                               AxisDataset),
+                                       OpsDataset),
+                               QInfoDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // ArgMinMax
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ArithmeticAddition.cpp b/tests/validation/NEON/ArithmeticAddition.cpp
index d1b6ce24dc..535c3e634e 100644
--- a/tests/validation/NEON/ArithmeticAddition.cpp
+++ b/tests/validation/NEON/ArithmeticAddition.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,9 +22,12 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuAddKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ConvertPolicyDataset.h"
@@ -43,27 +46,13 @@ namespace validation
 {
 namespace
 {
-#ifndef __aarch64__
+#if !defined(__aarch64__) || defined(ENABLE_SVE)
 constexpr AbsoluteTolerance<float> tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
-#endif                                                 //__aarch64__
-
-/** Input data sets **/
-const auto ArithmeticAdditionU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType",
-                                                 DataType::U8));
-const auto ArithmeticAdditionS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
-                                                  framework::dataset::make("DataType", DataType::S16));
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-const auto ArithmeticAdditionFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F16)),
-                                                   framework::dataset::make("DataType", DataType::F16));
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-const auto ArithmeticAdditionFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
-                                                   framework::dataset::make("DataType", DataType::F32));
-const auto ArithmeticAdditionQASYMM8Dataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                      framework::dataset::make("DataType", DataType::QASYMM8));
-const auto ArithmeticAdditionQASYMM8SIGNEDDataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                      framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
-const auto ArithmeticAdditionQSYMM16Dataset = combine(combine(framework::dataset::make("DataType", DataType::QSYMM16), framework::dataset::make("DataType", DataType::QSYMM16)),
-                                                      framework::dataset::make("DataType", DataType::QSYMM16));
+#else                                                  // !defined(__aarch64__) || defined(ENABLE_SVE)
+constexpr AbsoluteTolerance<float> tolerance_quant(0);
+#endif                                                 // !defined(__aarch64__) || defined(ENABLE_SVE)
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -75,25 +64,22 @@ using NEArithmeticAdditionFixture = ArithmeticAdditionValidationFixture<Tensor,
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-               framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), // Unsupported broadcast
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),// Mismatching shapes
                                                       }),
-               framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(1U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Expected", { true, true, false, false, false})),
+               framework::dataset::make("Expected", { true, false, false, false})),
                input1_info, input2_info, output_info, expected)
 {
     Status s = NEArithmeticAddition::validate(&input1_info.clone()->set_is_resizable(false),
@@ -102,13 +88,88 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                               ConvertPolicy::WRAP);
     ARM_COMPUTE_EXPECT(bool(s) == expected, framework::LogLevel::ERRORS);
 }
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, concat(concat(
+                combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType", { DataType::F32,
+                                                              DataType::F16,
+                                                              DataType::U8,
+                                                              DataType::S16,
+                                                              DataType::S32,
+                                                              DataType::QASYMM8,
+                                                              DataType::QASYMM8_SIGNED,
+                                                              DataType::QSYMM16
+                                                            })),
+                        framework::dataset::make("CanUseFixedpoint", {true, false})),
+                combine(combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                        framework::dataset::make("DataType", { DataType::F32,
+                                                               DataType::F16,
+                                                               DataType::U8,
+                                                               DataType::S16,
+                                                               DataType::S32
+                                                             })),
+                        framework::dataset::make("CanUseFixedpoint", {true, false}))),
+                combine(combine(framework::dataset::make("CpuExt", std::string("SVE2")),
+                        framework::dataset::make("DataType", { DataType::QASYMM8,
+                                                               DataType::QASYMM8_SIGNED,
+                                                               DataType::QSYMM16
+                                                             })),
+                        framework::dataset::make("CanUseFixedpoint", {true, false}))),
+               cpu_ext, data_type, can_use_fixedpoint)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.sve2 = (cpu_ext == "SVE2");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{data_type, cpu_isa, can_use_fixedpoint}, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    bool qasymm8_any = (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED);
+
+    std::string expected;
+    if(qasymm8_any && can_use_fixedpoint)
+    {
+        expected = "neon_" + cpu_impl_dt(data_type) + "_add_fixedpoint";
+    }
+    else
+    {
+        expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_add";
+    }
+
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
 // clang-format on
 // *INDENT-ON*
 
+TEST_CASE(NoPaddingAdded, framework::DatasetMode::PRECOMMIT)
+{
+    // NEArithmeticAddition doesn't use padding, so make sure this is the case.
+    Tensor input1 = create_tensor<Tensor>(TensorShape(15U, 15U), DataType::F32);
+    Tensor input2 = create_tensor<Tensor>(TensorShape(15U, 1U), DataType::F32);
+    Tensor output = create_tensor<Tensor>(TensorShape(15U, 15U), DataType::F32);
+
+    NEArithmeticAddition add;
+    add.configure(&input1, &input2, &output, ConvertPolicy::WRAP);
+
+    // Validate padding is zero
+    validate(input1.info()->padding(), PaddingSize());
+    validate(input2.info()->padding(), PaddingSize());
+    validate(output.info()->padding(), PaddingSize());
+}
+
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionU8Dataset),
-                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                  DataType::U8)),
+                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                  OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -116,27 +177,43 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<uint8_t>, framework
 TEST_SUITE_END() // U8
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionS16Dataset),
-                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                  DataType::S16)),
+                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                  OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ArithmeticAdditionS16Dataset),
-                                                                                                                framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                        DataType::S16)),
+                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // S16
+
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<int32_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                    DataType::S32)),
+                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                            OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // S32
 TEST_SUITE_END() // Integer
 
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticAdditionFP16Dataset),
-                                                                                                         framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                 framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                         OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -145,15 +222,19 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionFP32Dataset),
-                                                                                                                framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                        DataType::F32)),
+                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ArithmeticAdditionFP32Dataset),
-                                                                                                              framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                      DataType::F32)),
+                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                              OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -162,17 +243,19 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticAdditionFixture<float>, framework::
 template <typename T>
 using NEArithmeticAdditionBroadcastFixture = ArithmeticAdditionBroadcastValidationFixture<Tensor, Accessor, NEArithmeticAddition, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticAdditionBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapesBroadcast(),
-                       ArithmeticAdditionFP32Dataset),
-                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticAdditionBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapesBroadcast(),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEArithmeticAdditionBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapesBroadcast(),
-                       ArithmeticAdditionFP32Dataset),
-                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEArithmeticAdditionBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapesBroadcast(),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -183,23 +266,23 @@ TEST_SUITE_END() // Float
 template <typename T>
 using NEArithmeticAdditionQuantizedFixture = ArithmeticAdditionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticAddition, T>;
 
+template <typename T>
+using NEArithmeticAdditionQuantizedBroadcastFixture = ArithmeticAdditionValidationQuantizedBroadcastFixture<Tensor, Accessor, NEArithmeticAddition, T>;
+
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEArithmeticAdditionQuantizedFixture<uint8_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(datasets::SmallShapes(), ArithmeticAdditionQASYMM8Dataset),
-                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                               framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
-                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                               framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                       combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               OutOfPlaceDataSet))
 {
     // Validate output
-#ifdef __aarch64__
-    validate(Accessor(_target), _reference);
-#else  //__aarch64__
     validate(Accessor(_target), _reference, tolerance_quant);
-#endif //__aarch64__
 }
 TEST_SUITE_END() // QASYMM8
 
@@ -207,18 +290,27 @@ TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEArithmeticAdditionQuantizedFixture<int8_t>,
                        framework::DatasetMode::ALL,
-                       combine(combine(combine(combine(combine(datasets::SmallShapes(), ArithmeticAdditionQASYMM8SIGNEDDataset),
-                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                               framework::dataset::make("Src0QInfo", { QuantizationInfo(0.5f, 20) })),
-                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(0.5f, 10) })),
-                               framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })))
+                       combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(0.5f, 20) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(0.5f, 10) })),
+                                       framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })),
+                               OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quant);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticAdditionQuantizedBroadcastFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(
+                           datasets::SmallShapesBroadcast(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                       framework::dataset::make("Src0QInfo", { QuantizationInfo(0.5f, 20) })),
+                       framework::dataset::make("Src1QInfo", { QuantizationInfo(0.5f, 10) })),
+                       framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
-#ifdef __aarch64__
-    validate(Accessor(_target), _reference);
-#else  //__aarch64__
     validate(Accessor(_target), _reference, tolerance_quant);
-#endif //__aarch64__
 }
 TEST_SUITE_END() // QASYMM8_SIGNED
 
@@ -226,24 +318,21 @@ TEST_SUITE(QSYMM16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEArithmeticAdditionQuantizedFixture<int16_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(datasets::SmallShapes(), ArithmeticAdditionQSYMM16Dataset),
-                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                               framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
-                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
-                               framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })))
+                       combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QSYMM16)),
+                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
+                                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })),
+                               OutOfPlaceDataSet))
 {
     // Validate output
-#ifdef __aarch64__
-    validate(Accessor(_target), _reference);
-#else  //__aarch64__
     validate(Accessor(_target), _reference, tolerance_quant);
-#endif //__aarch64__
 }
 TEST_SUITE_END() // QSYMM16
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // ArithmeticAddition
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ArithmeticSubtraction.cpp b/tests/validation/NEON/ArithmeticSubtraction.cpp
index 420d61d1ee..8886ca2db5 100644
--- a/tests/validation/NEON/ArithmeticSubtraction.cpp
+++ b/tests/validation/NEON/ArithmeticSubtraction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,44 +50,21 @@ constexpr AbsoluteTolerance<float> tolerance_qasymm8(1); /**< Tolerance value fo
 #endif                                                     //__aarch64__
 constexpr AbsoluteTolerance<int16_t> tolerance_qsymm16(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
 
-/** Input data sets **/
-const auto ArithmeticSubtractionQASYMM8Dataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8),
-                                                                 framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                         framework::dataset::make("DataType", DataType::QASYMM8));
-
-const auto ArithmeticSubtractionQASYMM8SIGNEDDataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED),
-                                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
-
-const auto ArithmeticSubtractionQSYMM16Dataset = combine(combine(framework::dataset::make("DataType", DataType::QSYMM16),
-                                                                 framework::dataset::make("DataType", DataType::QSYMM16)),
-                                                         framework::dataset::make("DataType", DataType::QSYMM16));
-
-const auto ArithmeticSubtractionU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8),
-                                                            framework::dataset::make("DataType", DataType::U8)),
-                                                    framework::dataset::make("DataType", DataType::U8));
-
-const auto ArithmeticSubtractionS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }),
-                                                             framework::dataset::make("DataType", DataType::S16)),
-                                                     framework::dataset::make("DataType", DataType::S16));
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-const auto ArithmeticSubtractionFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16),
-                                                              framework::dataset::make("DataType", DataType::F16)),
-                                                      framework::dataset::make("DataType", DataType::F16));
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-const auto ArithmeticSubtractionFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32),
-                                                              framework::dataset::make("DataType", DataType::F32)),
-                                                      framework::dataset::make("DataType", DataType::F32));
-
+// Quantization Information DataSet
 const auto ArithmeticSubtractionQuantizationInfoDataset = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(10, 120) }),
                                                                           framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(20, 110) })),
                                                                   framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(15, 125) }));
 const auto ArithmeticSubtractionQuantizationInfoSignedDataset = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(0.5f, 10) }),
                                                                                 framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(0.5f, 20) })),
                                                                         framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(0.5f, 50) }));
+const auto ArithmeticSubtractionQuantizationInfoSignedInPlaceDataset = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(0.8f, 10) }),
+                                                                                       framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(0.8f, 10) })),
+                                                                               framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(0.8f, 10) }));
 const auto ArithmeticSubtractionQuantizationInfoSymmetric = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(0.3f, 0) }),
                                                                             framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(0.7f, 0) })),
                                                                     framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(0.2f, 0) }));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -99,33 +76,31 @@ using NEArithmeticSubtractionFixture = ArithmeticSubtractionValidationFixture<Te
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-        framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+        framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Invalid data type combination
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),     // Mismatching shapes
                                                  TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::QASYMM8), // Mismatching types
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), // Invalid convert policy
         }),
-        framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+        framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
         })),
-        framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
-                                                TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+        framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
+                                                TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
         })),
-        framework::dataset::make("ConvertPolicy",{ ConvertPolicy::WRAP,
-                                                ConvertPolicy::SATURATE,
-                                                ConvertPolicy::SATURATE,
-                                                ConvertPolicy::WRAP,
-                                                ConvertPolicy::WRAP,
+        framework::dataset::make("ConvertPolicy",{ ConvertPolicy::SATURATE,
+                                                   ConvertPolicy::SATURATE,
+                                                   ConvertPolicy::WRAP,
+                                                   ConvertPolicy::WRAP,
+                                                   ConvertPolicy::WRAP,
         })),
-        framework::dataset::make("Expected", { true, true, false, false, false, false})),
+        framework::dataset::make("Expected", { true, false, false, false, false})),
         input1_info, input2_info, output_info, policy, expected)
 {
     ARM_COMPUTE_EXPECT(bool(NEArithmeticSubtraction::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), policy)) == expected, framework::LogLevel::ERRORS);
@@ -133,24 +108,81 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
+TEST_SUITE(InPlaceValidate)
+TEST_CASE(SingleTensor, framework::DatasetMode::ALL)
+{
+    const auto random_shape       = TensorShape{ 9, 9 };
+    const auto single_tensor_info = TensorInfo{ random_shape, 1, DataType::F32 };
+
+    Status result = NEArithmeticSubtraction::validate(&single_tensor_info, &single_tensor_info, &single_tensor_info, ConvertPolicy::WRAP);
+    ARM_COMPUTE_EXPECT(bool(result) == true, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(ValidBroadCast, framework::DatasetMode::ALL)
+{
+    const auto larger_shape  = TensorShape{ 27U, 13U, 2U };
+    const auto smaller_shape = TensorShape{ 1U, 13U, 2U };
+
+    const auto larger_tensor_info  = TensorInfo{ larger_shape, 1, DataType::F32 };
+    const auto smaller_tensor_info = TensorInfo{ smaller_shape, 1, DataType::F32 };
+
+    Status result = NEArithmeticSubtraction::validate(&larger_tensor_info, &smaller_tensor_info, &larger_tensor_info, ConvertPolicy::WRAP);
+    ARM_COMPUTE_EXPECT(bool(result) == true, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(InvalidBroadcastOutput, framework::DatasetMode::ALL)
+{
+    const auto larger_shape  = TensorShape{ 27U, 13U, 2U };
+    const auto smaller_shape = TensorShape{ 1U, 13U, 2U };
+
+    const auto larger_tensor_info  = TensorInfo{ larger_shape, 1, DataType::F32 };
+    const auto smaller_tensor_info = TensorInfo{ smaller_shape, 1, DataType::F32 };
+
+    Status result = NEArithmeticSubtraction::validate(&larger_tensor_info, &smaller_tensor_info, &smaller_tensor_info, ConvertPolicy::WRAP);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(InvalidBroadcastBoth, framework::DatasetMode::ALL)
+{
+    const auto shape0 = TensorShape{ 9U, 9U };
+    const auto shape1 = TensorShape{ 9U, 1U, 2U };
+
+    const auto info0 = TensorInfo{ shape0, 1, DataType::F32 };
+    const auto info1 = TensorInfo{ shape1, 1, DataType::F32 };
+
+    Status result{};
+
+    result = NEArithmeticSubtraction::validate(&info0, &info1, &info0, ConvertPolicy::WRAP);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+
+    result = NEArithmeticSubtraction::validate(&info0, &info1, &info1, ConvertPolicy::WRAP);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // InPlaceValidate
+
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticSubtractionU8Dataset),
-                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                     DataType::U8)),
+                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                     OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // U8
 
-using NEArithmeticSubtractionQASYMM8Fixture       = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, uint8_t>;
-using NEArithmeticSubtractionQASYMM8SignedFixture = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, int8_t>;
-using NEArithmeticSubtractionQSYMM16Fixture       = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, int16_t>;
+using NEArithmeticSubtractionQASYMM8Fixture                = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, uint8_t>;
+using NEArithmeticSubtractionQASYMM8SignedFixture          = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, int8_t>;
+using NEArithmeticSubtractionQASYMM8SignedBroadcastFixture = ArithmeticSubtractionValidationQuantizedBroadcastFixture<Tensor, Accessor, NEArithmeticSubtraction, int8_t>;
+using NEArithmeticSubtractionQSYMM16Fixture                = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, int16_t>;
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionQASYMM8Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                     DataType::QASYMM8)),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                                                                                             ArithmeticSubtractionQuantizationInfoDataset))
+                                                                                                                     ArithmeticSubtractionQuantizationInfoDataset),
+                                                                                                             OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -158,11 +190,31 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8Fixture, framewor
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8SignedFixture, framework::DatasetMode::ALL, combine(combine(combine(
-                                                                                                                       datasets::SmallShapes(),
-                                                                                                                       ArithmeticSubtractionQASYMM8SIGNEDDataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8SignedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(
+                                                                                                                       datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                                                                                                   ArithmeticSubtractionQuantizationInfoSignedDataset))
+                                                                                                                   ArithmeticSubtractionQuantizationInfoSignedDataset),
+                                                                                                                   OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQASYMM8SignedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(
+                           datasets::SmallShapesBroadcast(),
+                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                       ArithmeticSubtractionQuantizationInfoSignedDataset),
+                       OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEArithmeticSubtractionQASYMM8SignedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(
+                           datasets::TinyShapesBroadcastInplace(),
+                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                       ArithmeticSubtractionQuantizationInfoSignedInPlaceDataset),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -170,11 +222,12 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8SignedFixture, fr
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 TEST_SUITE(QSYMM16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQSYMM16Fixture, framework::DatasetMode::ALL, combine(combine(combine(
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQSYMM16Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(
         datasets::SmallShapes(),
-        ArithmeticSubtractionQSYMM16Dataset),
+        framework::dataset::make("DataType", DataType::QSYMM16)),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                                                                                             ArithmeticSubtractionQuantizationInfoSymmetric))
+                                                                                                                     ArithmeticSubtractionQuantizationInfoSymmetric),
+                                                                                                             OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qsymm16);
@@ -183,26 +236,52 @@ TEST_SUITE_END() // QSYMM16
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticSubtractionS16Dataset),
-                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                     DataType::S16)),
+                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                     OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ArithmeticSubtractionS16Dataset),
-                                                                                                                   framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::S16)),
+                                                                                                                   framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                   OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // S16
 
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                     DataType::S32)),
+                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                     OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<int32_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::S32)),
+                                                                                                                   framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                   OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // S32
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP16Dataset),
-                                                                                                            framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                    DataType::F16)),
+                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                            OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -211,15 +290,19 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP32Dataset),
-                                                                                                                   framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::F32)),
+                                                                                                                   framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                   InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ArithmeticSubtractionFP32Dataset),
-                                                                                                                 framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                 DataType::F32)),
+                                                                                                                 framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                 OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -228,17 +311,19 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<float>, framewor
 template <typename T>
 using NEArithmeticSubtractionBroadcastFixture = ArithmeticSubtractionBroadcastValidationFixture<Tensor, Accessor, NEArithmeticSubtraction, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapesBroadcast(),
-                       ArithmeticSubtractionFP32Dataset),
-                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapesBroadcast(),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEArithmeticSubtractionBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapesBroadcast(),
-                       ArithmeticSubtractionFP32Dataset),
-                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEArithmeticSubtractionBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapesBroadcast(),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -247,7 +332,7 @@ TEST_SUITE_END() // F32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // ArithmeticSubtraction
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/BatchConcatenateLayer.cpp b/tests/validation/NEON/BatchConcatenateLayer.cpp
index 2c0ebc3d54..6eafe82f8a 100644
--- a/tests/validation/NEON/BatchConcatenateLayer.cpp
+++ b/tests/validation/NEON/BatchConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,7 +70,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
     inputs_vector_info.emplace_back(std::move(input_info1));
     inputs_vector_info.emplace_back(std::move(input_info2));
 
-    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    std::vector<const ITensorInfo *> inputs_vector_info_raw;
     inputs_vector_info_raw.reserve(inputs_vector_info.size());
     for(auto &input : inputs_vector_info)
     {
diff --git a/tests/validation/NEON/BatchNormalizationLayer.cpp b/tests/validation/NEON/BatchNormalizationLayer.cpp
index 58b7474b41..50eaf0c667 100644
--- a/tests/validation/NEON/BatchNormalizationLayer.cpp
+++ b/tests/validation/NEON/BatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,8 +51,9 @@ namespace
 RelativeTolerance<float>           rel_tolerance_f32(0.05f);   /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 constexpr AbsoluteTolerance<float> abs_tolerance_f32(0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-constexpr AbsoluteTolerance<float> tolerance_f16(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
-#endif                                                   // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+constexpr AbsoluteTolerance<float> abs_tolerance_f16(0.015f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+#endif                                                       // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
 const auto act_infos = framework::dataset::make("ActivationInfo",
 {
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
@@ -71,69 +72,34 @@ TEST_SUITE(BatchNormalizationLayer)
 template <typename T>
 using NEBatchNormalizationLayerFixture = BatchNormalizationLayerValidationFixture<Tensor, Accessor, NEBatchNormalizationLayer, T>;
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallRandomBatchNormalizationLayerDataset(),
-                                                                                   combine(framework::dataset::make("UseBeta", { false, true }), framework::dataset::make("UseGamma", { false, true }))),
-                                                                           framework::dataset::make("DataType", { DataType::F32 })),
-                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-               shape0, shape1, epsilon, use_beta, use_gamma, dt, data_layout)
-{
-    TensorShape src_dst_shapes = shape0;
-    if(data_layout == DataLayout::NHWC)
-    {
-        permute(src_dst_shapes, PermutationVector(2U, 0U, 1U));
-    }
-
-    // Create tensors
-    Tensor src   = create_tensor<Tensor>(src_dst_shapes, dt, 1, QuantizationInfo(), data_layout);
-    Tensor dst   = create_tensor<Tensor>(src_dst_shapes, dt, 1, QuantizationInfo(), data_layout);
-    Tensor mean  = create_tensor<Tensor>(shape1, dt, 1);
-    Tensor var   = create_tensor<Tensor>(shape1, dt, 1);
-    Tensor beta  = create_tensor<Tensor>(shape1, dt, 1);
-    Tensor gamma = create_tensor<Tensor>(shape1, dt, 1);
-
-    // Create and Configure function
-    NEBatchNormalizationLayer norm;
-    Tensor                   *beta_ptr  = use_beta ? &beta : nullptr;
-    Tensor                   *gamma_ptr = use_gamma ? &gamma : nullptr;
-    norm.configure(&src, &dst, &mean, &var, beta_ptr, gamma_ptr, epsilon);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(src_dst_shapes);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Window shrink
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Mismatching data types
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Mismatching data types
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Invalid mean/var/beta/gamma shape
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Fused activation's a < b
                                                      }),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                      })),
                framework::dataset::make("MVBGInfo",{ TensorInfo(TensorShape(2U), 1, DataType::F32),
-                                                     TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                      TensorInfo(TensorShape(2U), 1, DataType::F16),
                                                      TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                      TensorInfo(TensorShape(5U), 1, DataType::F32),
                                                      TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                    })),
                framework::dataset::make("ActivationLayerInfo",{ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
-                                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
                                                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f),
                                                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f),
                                                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f),
                                                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 2.f, 6.f),
                                                    })),
-               framework::dataset::make("Expected", { true, false, false, false, false, false})),
+               framework::dataset::make("Expected", { true, false, false, false, false})),
                input_info, output_info, mvbg_info, act_info, expected)
 {
     const auto &mean_info = mvbg_info;
@@ -183,7 +149,7 @@ FIXTURE_DATA_TEST_CASE(RandomSmall, NEBatchNormalizationLayerFixture<half>, fram
                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_f16, 0);
+    validate(Accessor(_target), _reference, abs_tolerance_f16, 0);
 }
 
 FIXTURE_DATA_TEST_CASE(RandomLarge, NEBatchNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::LargeRandomBatchNormalizationLayerDataset(),
@@ -194,7 +160,7 @@ FIXTURE_DATA_TEST_CASE(RandomLarge, NEBatchNormalizationLayerFixture<half>, fram
                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_f16, 0);
+    validate(Accessor(_target), _reference, abs_tolerance_f16, 0);
 }
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -254,7 +220,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // BatchNormalizationLayerFusion
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/BatchToSpaceLayer.cpp b/tests/validation/NEON/BatchToSpaceLayer.cpp
index b5d684a451..8cf11b7b95 100644
--- a/tests/validation/NEON/BatchToSpaceLayer.cpp
+++ b/tests/validation/NEON/BatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,55 +49,38 @@ using NEBatchToSpaceLayerFixture = BatchToSpaceLayerValidationFixture<Tensor, Ac
 
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blockx > blocky
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blocky > blockx
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),     // Mismatching data types
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),     // Wrong data type block shape
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U, 4U), 1, DataType::F32), // Wrong tensor shape
-                                                     }),
-               framework::dataset::make("BlockShapeInfo",{ TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(2U, 4U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(4U, 2U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(2U, 2U), 1, DataType::F16),
-                                                       TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
-                                                     })),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(64U, 16U, 2U, 1U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 32U, 2U, 1U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F16),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
-                                                     })),
-               framework::dataset::make("Expected", { true, true, true, false, false, false})),
-               input_info, block_shape_info, output_info, expected)
-{
-    bool has_error = bool(NEBatchToSpaceLayer::validate(&input_info.clone()->set_is_resizable(false), &block_shape_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false)));
-    ARM_COMPUTE_EXPECT(has_error == expected, framework::LogLevel::ERRORS);
-}
-DATA_TEST_CASE(ValidateStatic, framework::DatasetMode::ALL, zip(zip(zip(zip(
+DATA_TEST_CASE(ValidateStatic, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blockx > blocky
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // blockx != blocky && blocky > blockx
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),    // Mismatching data types
-                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),    // Negative block shapes
-                                                       TensorInfo(TensorShape(32U, 16U, 2U, 4U, 4U), 1, DataType::F32), // Wrong tensor shape
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Supported: blockx != blocky && blockx > blocky
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Supported: blockx != blocky && blocky > blockx
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),     // Invalid: Mismatching data types
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 4U), 1, DataType::F32),     // Invalid: Negative block shapes
+                                                       TensorInfo(TensorShape(32U, 16U, 2U, 4U, 4U), 1, DataType::F32),// Unsupported tensor rank
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Invalid output tensor shape (invalid batch dimension)
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Invalid output tensor shape (invalid spatial dimension)
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Supported: correct tensor shape with cropping
+                                                       TensorInfo(TensorShape(16U, 8U, 2U, 16U), 1, DataType::F32),    // Invalid tensor shape with cropping
                                                      }),
-               framework::dataset::make("BlockShapeX", { 2, 4, 2, 2, 2, 2 })),
-               framework::dataset::make("BlockShapeY", { 2, 2, 4, 2, -2, 2 })),
+               framework::dataset::make("BlockShapeX", { 2, 4, 2, 2, 2, 2, 2, 2, 2, 2 })),
+               framework::dataset::make("BlockShapeY", { 2, 2, 4, 2, -2, 2, 2, 2, 2, 2 })),
+               framework::dataset::make("CropInfo", {
+                CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{}, CropInfo{3, 2, 1, 3}, CropInfo{3, 2, 1, 3}
+               })),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(64U, 16U, 2U, 1U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 32U, 2U, 1U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(64U, 16U, 2U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 32U, 2U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F16),
                                                        TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 8U, 2U, 1U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 16U, 2U, 1U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(33U, 32U, 2U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(27, 12U, 2U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 16U, 2U, 4U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Expected", { true, true, true, false, false, false})),
-               input_info, block_shape_x, block_shape_y, output_info, expected)
+               framework::dataset::make("Expected", { true, true, true, false, false, false, false, false, true, false})),
+               input_info, block_shape_x, block_shape_y, crop_info, output_info, expected)
 {
-    bool has_error = bool(NEBatchToSpaceLayer::validate(&input_info.clone()->set_is_resizable(false), block_shape_x, block_shape_y, &output_info.clone()->set_is_resizable(false)));
+    bool has_error = bool(NEBatchToSpaceLayer::validate(&input_info.clone()->set_is_resizable(false), block_shape_x, block_shape_y, &output_info.clone()->set_is_resizable(false), crop_info));
     ARM_COMPUTE_EXPECT(has_error == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
@@ -112,6 +95,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchToSpaceLayerFixture<float>, framework::D
     // Validate output
     validate(Accessor(_target), _reference);
 }
+
+FIXTURE_DATA_TEST_CASE(RunSmallWithCropping, NEBatchToSpaceLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(datasets::SmallBatchToSpaceLayerWithCroppingDataset(), framework::dataset::make("DataType",
+                                                                                                                       DataType::F32)),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge, NEBatchToSpaceLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeBatchToSpaceLayerDataset(), framework::dataset::make("DataType",
                                                                                                                      DataType::F32)),
                                                                                                              framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
@@ -129,6 +122,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchToSpaceLayerFixture<half>, framework::Da
     // Validate output
     validate(Accessor(_target), _reference);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallWithCropping, NEBatchToSpaceLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(datasets::SmallBatchToSpaceLayerWithCroppingDataset(), framework::dataset::make("DataType",
+                                                                                                                       DataType::F16)),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
 FIXTURE_DATA_TEST_CASE(RunLarge, NEBatchToSpaceLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeBatchToSpaceLayerDataset(), framework::dataset::make("DataType",
                                                                                                                     DataType::F16)),
                                                                                                             framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
@@ -140,7 +142,7 @@ TEST_SUITE_END()
 TEST_SUITE_END()
 
 TEST_SUITE_END() // BatchToSpace
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/BitwiseAnd.cpp b/tests/validation/NEON/BitwiseAnd.cpp
index 773c985838..eff649e87a 100644
--- a/tests/validation/NEON/BitwiseAnd.cpp
+++ b/tests/validation/NEON/BitwiseAnd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,34 +43,6 @@ namespace validation
 TEST_SUITE(NEON)
 TEST_SUITE(BitwiseAnd)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type)
-{
-    // Create tensors
-    Tensor src1 = create_tensor<Tensor>(shape, data_type);
-    Tensor src2 = create_tensor<Tensor>(shape, data_type);
-    Tensor dst  = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEBitwiseAnd bitwise_and;
-    bitwise_and.configure(&src1, &src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src1.info()->valid_region(), valid_region);
-    validate(src2.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src1.info()->padding(), padding);
-    validate(src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 template <typename T>
 using NEBitwiseAndFixture = BitwiseAndValidationFixture<Tensor, Accessor, NEBitwiseAnd, T>;
 
@@ -81,7 +53,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEBitwiseAndFixture<uint8_t>, framework::Datase
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // BitwiseAnd
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/BitwiseNot.cpp b/tests/validation/NEON/BitwiseNot.cpp
index abc091ce91..73b49b6ff1 100644
--- a/tests/validation/NEON/BitwiseNot.cpp
+++ b/tests/validation/NEON/BitwiseNot.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,30 +43,6 @@ namespace validation
 TEST_SUITE(NEON)
 TEST_SUITE(BitwiseNot)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEBitwiseNot bitwise_not;
-    bitwise_not.configure(&src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 template <typename T>
 using NEBitwiseNotFixture = BitwiseNotValidationFixture<Tensor, Accessor, NEBitwiseNot, T>;
 
@@ -77,7 +53,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEBitwiseNotFixture<uint8_t>, framework::Datase
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // BitwiseNot
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/BitwiseOr.cpp b/tests/validation/NEON/BitwiseOr.cpp
index 1a33443049..61d1273b37 100644
--- a/tests/validation/NEON/BitwiseOr.cpp
+++ b/tests/validation/NEON/BitwiseOr.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,34 +43,6 @@ namespace validation
 TEST_SUITE(NEON)
 TEST_SUITE(BitwiseOr)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type)
-{
-    // Create tensors
-    Tensor src1 = create_tensor<Tensor>(shape, data_type);
-    Tensor src2 = create_tensor<Tensor>(shape, data_type);
-    Tensor dst  = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEBitwiseOr bitwise_or;
-    bitwise_or.configure(&src1, &src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src1.info()->valid_region(), valid_region);
-    validate(src2.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src1.info()->padding(), padding);
-    validate(src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 template <typename T>
 using NEBitwiseOrFixture = BitwiseOrValidationFixture<Tensor, Accessor, NEBitwiseOr, T>;
 
@@ -81,7 +53,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEBitwiseOrFixture<uint8_t>, framework::Dataset
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // BitwiseOr
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/BitwiseXor.cpp b/tests/validation/NEON/BitwiseXor.cpp
index 2a8c0c97ae..727052ed6b 100644
--- a/tests/validation/NEON/BitwiseXor.cpp
+++ b/tests/validation/NEON/BitwiseXor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,34 +43,6 @@ namespace validation
 TEST_SUITE(NEON)
 TEST_SUITE(BitwiseXor)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type)
-{
-    // Create tensors
-    Tensor src1 = create_tensor<Tensor>(shape, data_type);
-    Tensor src2 = create_tensor<Tensor>(shape, data_type);
-    Tensor dst  = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEBitwiseXor bitwise_xor;
-    bitwise_xor.configure(&src1, &src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src1.info()->valid_region(), valid_region);
-    validate(src2.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src1.info()->padding(), padding);
-    validate(src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 template <typename T>
 using NEBitwiseXorFixture = BitwiseXorValidationFixture<Tensor, Accessor, NEBitwiseXor, T>;
 
@@ -81,7 +53,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEBitwiseXorFixture<uint8_t>, framework::Datase
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // BitwiseXor
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/BoundingBoxTransform.cpp b/tests/validation/NEON/BoundingBoxTransform.cpp
index 3f16b4525f..2ca2434150 100644
--- a/tests/validation/NEON/BoundingBoxTransform.cpp
+++ b/tests/validation/NEON/BoundingBoxTransform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -153,7 +153,7 @@ TEST_SUITE_END() // QASYMM16
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // BBoxTransform
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Box3x3.cpp b/tests/validation/NEON/Box3x3.cpp
deleted file mode 100644
index 21bce05b41..0000000000
--- a/tests/validation/NEON/Box3x3.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEBox3x3.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/Box3x3Fixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /* Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /* Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Box3x3)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
-                                                                   datasets::BorderModes()),
-               shape, data_type, border_mode)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEBox3x3 box3x3;
-    box3x3.configure(&src, &dst, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size);
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(1);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-1);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
-template <typename T>
-using NEBox3x3Fixture = Box3x3ValidationFixture<Tensor, Accessor, NEBox3x3, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEBox3x3Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                              DataType::U8)),
-                                                                                                      datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEBox3x3Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::U8)),
-                                                                                                    datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/CannyEdge.cpp b/tests/validation/NEON/CannyEdge.cpp
deleted file mode 100644
index 7c4cd8090f..0000000000
--- a/tests/validation/NEON/CannyEdge.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NECannyEdge.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/ArrayAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ImageFileDatasets.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/CannyEdgeFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/* Allowed ratio of mismatches between target and reference (1.0 = 100%) */
-const float allowed_mismatch_ratio = 0.1f;
-
-const auto data = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }),
-                          combine(framework::dataset::make("Normalization", { MagnitudeType::L1NORM, MagnitudeType::L2NORM }), datasets::BorderModes()));
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(CannyEdge)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), data), framework::dataset::make("Format", Format::U8)),
-               shape, gradient_size, normalization, border_mode, format)
-{
-    CannyEdgeParameters params = canny_edge_parameters();
-    // Convert normalisation type to integer
-    const auto norm_type = static_cast<int>(normalization) + 1;
-
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type_from_format(format));
-    Tensor dst = create_tensor<Tensor>(shape, data_type_from_format(format));
-    src.info()->set_format(format);
-    dst.info()->set_format(format);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create Canny edge configure function
-    NECannyEdge canny_edge;
-    canny_edge.configure(&src, &dst, params.upper_thresh, params.lower_thresh, gradient_size, norm_type, border_mode, params.constant_border_value);
-
-    // Validate valid region
-    validate(src.info()->valid_region(), shape_to_valid_region(shape, (BorderMode::UNDEFINED == border_mode)));
-    validate(dst.info()->valid_region(), shape_to_valid_region(shape, (BorderMode::UNDEFINED == border_mode)));
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_mode(border_mode);
-    calculator.set_border_size(gradient_size / 2);
-    calculator.set_access_offset(-gradient_size / 2);
-    calculator.set_accessed_elements(16);
-
-    validate(src.info()->padding(), calculator.required_padding());
-    validate(dst.info()->padding(), PaddingSize{ 1 });
-}
-
-template <typename T>
-using NECannyEdgeFixture = CannyEdgeValidationFixture<Tensor, Accessor, KeyPointArray, NECannyEdge, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NECannyEdgeFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallImageFiles(), data), framework::dataset::make("Format", Format::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, AbsoluteTolerance<uint8_t>(0), allowed_mismatch_ratio);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NECannyEdgeFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeImageFiles(), data), framework::dataset::make("Format", Format::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, AbsoluteTolerance<uint8_t>(0), allowed_mismatch_ratio);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Cast.cpp b/tests/validation/NEON/Cast.cpp
index fd66be37b2..b56594546b 100644
--- a/tests/validation/NEON/Cast.cpp
+++ b/tests/validation/NEON/Cast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,9 +22,12 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NECast.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuCastKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ConvertPolicyDataset.h"
@@ -34,7 +37,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/CastFixture.h"
-
 namespace arm_compute
 {
 namespace test
@@ -99,6 +101,11 @@ const auto CastF32toS32Dataset            = combine(framework::dataset::make("Da
 const auto CastF32toQASYMM8Dataset        = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8));
 const auto CastF32toQASYMM8_SIGNEDDataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
 
+// U64
+const auto CastU64toF32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::F32));
+
+// S64
+const auto CastS64toF32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::F32));
 } // namespace
 
 TEST_SUITE(NEON)
@@ -106,6 +113,8 @@ TEST_SUITE(Cast)
 template <typename T>
 using NECastToU8Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, uint8_t>;
 template <typename T>
+using NECastToS8Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int8_t>;
+template <typename T>
 using NECastToU16Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, uint16_t>;
 template <typename T>
 using NECastToS16Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int16_t>;
@@ -114,6 +123,10 @@ using NECastToU32Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, ui
 template <typename T>
 using NECastToS32Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int32_t>;
 template <typename T>
+using NECastToU64Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, uint64_t>;
+template <typename T>
+using NECastToS64Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int64_t>;
+template <typename T>
 using NECastToF16Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, half>;
 template <typename T>
 using NECastToF32Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, float>;
@@ -187,8 +200,68 @@ CAST_SUITE(F32_to_F16, DataType::F32, DataType::F16, NECastToF16Fixture<float>,
 CAST_SUITE(F32_to_S32, DataType::F32, DataType::S32, NECastToS32Fixture<float>, CastF32toS32Dataset, one_tolerance)
 CAST_SUITE(F32_to_U8, DataType::F32, DataType::S32, NECastToS32Fixture<float>, CastF32toS32Dataset, one_tolerance)
 
+#ifdef __aarch64__
+// S64
+CAST_SUITE(S64_to_F32, DataType::S64, DataType::F32, NECastToF32Fixture<int64_t>, CastS64toF32Dataset, zero_tolerance)
+
+// U64
+CAST_SUITE(U64_to_F32, DataType::U64, DataType::F32, NECastToF32Fixture<uint64_t>, CastU64toF32Dataset, zero_tolerance)
+#endif // __aarch64__
+
+DATA_TEST_CASE(KernelSelectionDstFP16, framework::DatasetMode::ALL,
+               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType",
+{
+    DataType::F16,
+    DataType::U8,
+    DataType::S32,
+    DataType::QASYMM8,
+    DataType::QASYMM8_SIGNED,
+})),
+cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+    const CpuCastKernel::CastKernel *selected_impl;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = true;
+
+    selected_impl = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ data_type, DataType::F16, cpu_isa }, cpu::KernelSelectionType::Preferred);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_cast";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
+DATA_TEST_CASE(KernelSelectionSrcFP32, framework::DatasetMode::ALL,
+               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType",
+{
+    DataType::F16,
+})),
+cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ DataType::F32, data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = lower_string(cpu_ext) + "_fp32_to_" + cpu_impl_dt(data_type) + "_cast";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
 TEST_SUITE_END() // Cast
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ChannelCombine.cpp b/tests/validation/NEON/ChannelCombine.cpp
deleted file mode 100644
index fcba07f596..0000000000
--- a/tests/validation/NEON/ChannelCombine.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/MultiImage.h"
-#include "arm_compute/runtime/NEON/functions/NEChannelCombine.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ConvertPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ChannelCombineFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-inline void validate_configuration(const TensorShape &shape, Format format)
-{
-    const int num_planes = num_planes_from_format(format);
-
-    // Create tensors
-    MultiImage          dst     = create_multi_image<MultiImage>(shape, format);
-    std::vector<Tensor> ref_src = create_tensor_planes<Tensor>(shape, format);
-
-    // Create and configure function
-    NEChannelCombine channel_combine;
-
-    if(num_planes == 1)
-    {
-        const Tensor *tensor_extra = Format::RGBA8888 == format ? &ref_src[3] : nullptr;
-
-        channel_combine.configure(&ref_src[0], &ref_src[1], &ref_src[2], tensor_extra, dst.plane(0));
-    }
-    else
-    {
-        channel_combine.configure(&ref_src[0], &ref_src[1], &ref_src[2], &dst);
-    }
-
-    // TODO(bsgcomp): Add validation for padding and shape (COMPMID-659)
-}
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(ChannelCombine)
-
-TEST_SUITE(Configuration)
-DATA_TEST_CASE(RGBA, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 })),
-               shape, format)
-{
-    validate_configuration(shape, format);
-}
-DATA_TEST_CASE(YUV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 })),
-               shape, format)
-{
-    validate_configuration(shape, format);
-}
-
-DATA_TEST_CASE(YUVPlanar, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::IYUV, Format::YUV444, Format::NV12, Format::NV21 })),
-               shape, format)
-{
-    validate_configuration(shape, format);
-}
-TEST_SUITE_END() // Configuration
-
-template <typename T>
-using NEChannelCombineFixture = ChannelCombineValidationFixture<MultiImage, Tensor, Accessor, NEChannelCombine, T>;
-
-TEST_SUITE(RGBA)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // RGBA
-
-TEST_SUITE(YUV)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // YUV
-
-TEST_SUITE(YUVPlanar)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::NV12, Format::NV21, Format::IYUV, Format::YUV444 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("FormatType", { Format::NV12, Format::NV21, Format::IYUV, Format::YUV444 })))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // YUVPlanar
-
-TEST_SUITE_END() // ChannelCombine
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/ChannelExtract.cpp b/tests/validation/NEON/ChannelExtract.cpp
deleted file mode 100644
index 542cd1a193..0000000000
--- a/tests/validation/NEON/ChannelExtract.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/MultiImage.h"
-#include "arm_compute/runtime/NEON/functions/NEChannelExtract.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ConvertPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ChannelExtractFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-// Input data sets
-const auto ChannelExtractRGBADataset = combine(framework::dataset::make("FormatType", { Format::RGBA8888 }),
-                                               framework::dataset::make("ChannelType", { Channel::R, Channel::G, Channel::B, Channel::A }));
-const auto ChannelExtractYUVDataset = combine(framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 }),
-                                              framework::dataset::make("ChannelType", { Channel::Y, Channel::U, Channel::V }));
-const auto ChannelExtractYUVPlanarDataset = combine(framework::dataset::make("FormatType", { Format::IYUV, Format::YUV444, Format::NV12, Format::NV21 }),
-                                                    framework::dataset::make("ChannelType", { Channel::Y, Channel::U, Channel::V }));
-
-inline void validate_configuration(const TensorShape &shape, Format format, Channel channel)
-{
-    const unsigned int num_planes = num_planes_from_format(format);
-
-    TensorShape dst_shape = adjust_odd_shape(shape, format);
-    dst_shape             = calculate_subsampled_shape(dst_shape, format, channel);
-
-    // Create tensors
-    MultiImage ref_src = create_multi_image<MultiImage>(shape, format);
-    Tensor     dst     = create_tensor<Tensor>(dst_shape, Format::U8);
-
-    // Create and Configure function
-    NEChannelExtract channel_extract;
-
-    if(1U == num_planes)
-    {
-        const Tensor *plane_src = ref_src.plane(0);
-
-        channel_extract.configure(plane_src, channel, &dst);
-    }
-    else
-    {
-        channel_extract.configure(&ref_src, channel, &dst);
-    }
-
-    // TODO(bsgcomp): Add validation for padding and shape (COMPMID-659)
-}
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(ChannelExtract)
-
-template <typename T>
-using NEChannelExtractFixture = ChannelExtractValidationFixture<MultiImage, Tensor, Accessor, NEChannelExtract, T>;
-
-TEST_SUITE(Configuration)
-DATA_TEST_CASE(RGBA, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ChannelExtractRGBADataset),
-               shape, format, channel)
-{
-    validate_configuration(shape, format, channel);
-}
-DATA_TEST_CASE(YUV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ChannelExtractYUVDataset),
-               shape, format, channel)
-{
-    validate_configuration(shape, format, channel);
-}
-
-DATA_TEST_CASE(YUVPlanar, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ChannelExtractYUVPlanarDataset),
-               shape, format, channel)
-{
-    validate_configuration(shape, format, channel);
-}
-TEST_SUITE_END() // Configuration
-
-TEST_SUITE(RGBA)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelExtractFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), ChannelExtractRGBADataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEChannelExtractFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ChannelExtractRGBADataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // RGBA
-
-TEST_SUITE(YUV)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelExtractFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), ChannelExtractYUVDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEChannelExtractFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ChannelExtractYUVDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // YUV
-
-TEST_SUITE(YUVPlanar)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelExtractFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), ChannelExtractYUVPlanarDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEChannelExtractFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ChannelExtractYUVPlanarDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // YUVPlanar
-
-TEST_SUITE_END() // ChannelExtract
-TEST_SUITE_END() // NEON
-
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/ChannelShuffle.cpp b/tests/validation/NEON/ChannelShuffle.cpp
index ec84183f09..9a2a9f24f0 100644
--- a/tests/validation/NEON/ChannelShuffle.cpp
+++ b/tests/validation/NEON/ChannelShuffle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,25 +70,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-DATA_TEST_CASE(Configuration,
-               framework::DatasetMode::ALL,
-               combine(datasets::SmallRandomChannelShuffleLayerDataset(),
-                       framework::dataset::make("DataType", { DataType::S8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32 })),
-               shape, num_groups, data_type)
-{
-    // Create tensors
-    Tensor ref_src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst     = create_tensor<Tensor>(shape, data_type);
-
-    // Create and Configure function
-    NEChannelShuffleLayer channel_shuffle_func;
-    channel_shuffle_func.configure(&ref_src, &dst, num_groups);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 template <typename T>
 using NEChannelShuffleLayerFixture = ChannelShuffleLayerValidationFixture<Tensor, Accessor, NEChannelShuffleLayer, T>;
 
@@ -153,7 +134,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // ChannelShuffle
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Col2Im.cpp b/tests/validation/NEON/Col2Im.cpp
index e4a52f25dd..7eb8cbf0f6 100644
--- a/tests/validation/NEON/Col2Im.cpp
+++ b/tests/validation/NEON/Col2Im.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,8 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NECol2Im.h"
+#include "src/cpu/kernels/CpuCol2ImKernel.h"
+#include "tests/NEON/Helper.h"
 #include "tests/datasets/ShapeDatasets.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
@@ -38,6 +39,8 @@ namespace validation
 TEST_SUITE(NEON)
 TEST_SUITE(Col2Im)
 
+using CpuCol2Im = NESynthetizeFunction<cpu::kernels::CpuCol2ImKernel>;
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
@@ -56,7 +59,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                framework::dataset::make("Expected", { false, false, false, true })),
                input_info, output_info, convolved_width, convolved_height, expected)
 {
-    bool status = bool(NECol2Im::validate(&input_info, &output_info, Size2D(convolved_width, convolved_height)));
+    bool status = bool(CpuCol2Im::validate(&input_info, &output_info, Size2D(convolved_width, convolved_height)));
     ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
diff --git a/tests/validation/NEON/ColorConvert.cpp b/tests/validation/NEON/ColorConvert.cpp
deleted file mode 100644
index c776479d79..0000000000
--- a/tests/validation/NEON/ColorConvert.cpp
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/MultiImage.h"
-#include "arm_compute/runtime/NEON/functions/NEColorConvert.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ColorConvertFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr AbsoluteTolerance<uint8_t> tolerance_nv(2);
-constexpr AbsoluteTolerance<uint8_t> tolerance_u8(2);
-
-// Input data sets
-const auto RGBDataset  = framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 });
-const auto YUYVDataset = framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 });
-
-const auto ColorConvert_RGBA_to_RGB = combine(framework::dataset::make("FormatType", { Format::RGBA8888 }),
-                                              framework::dataset::make("FormatType", { Format::RGB888 }));
-
-const auto ColorConvert_RGB_to_RGBA = combine(framework::dataset::make("FormatType", { Format::RGB888 }),
-                                              framework::dataset::make("FormatType", { Format::RGBA8888 }));
-
-const auto ColorConvert_RGB_to_U8 = combine(framework::dataset::make("FormatType", { Format::RGB888 }),
-                                            framework::dataset::make("FormatType", { Format::U8 }));
-
-const auto ColorConvert_YUYVDataset_to_RGBDataset = combine(YUYVDataset,
-                                                            RGBDataset);
-
-const auto ColorConvert_YUVPlanar_to_RGBDataset = combine(framework::dataset::make("FormatType", { Format::IYUV, Format::NV12, Format::NV21 }),
-                                                          RGBDataset);
-
-const auto ColorConvert_RGBDataset_to_NVDataset = combine(RGBDataset,
-                                                          framework::dataset::make("FormatType", { Format::NV12, Format::IYUV, Format::YUV444 }));
-
-const auto ColorConvert_YUYVDataset_to_NVDataset = combine(YUYVDataset,
-                                                           framework::dataset::make("FormatType", { Format::NV12, Format::IYUV }));
-
-const auto ColorConvert_NVDataset_to_YUVDataset = combine(framework::dataset::make("FormatType", { Format::NV12, Format::NV21 }),
-                                                          framework::dataset::make("FormatType", { Format::IYUV, Format::YUV444 }));
-
-inline void validate_configuration(const TensorShape &shape, Format src_format, Format dst_format)
-{
-    const unsigned int src_num_planes = num_planes_from_format(src_format);
-    const unsigned int dst_num_planes = num_planes_from_format(dst_format);
-
-    TensorShape input = adjust_odd_shape(shape, src_format);
-    input             = adjust_odd_shape(input, dst_format);
-
-    // Create tensors
-    MultiImage ref_src = create_multi_image<MultiImage>(input, src_format);
-    MultiImage ref_dst = create_multi_image<MultiImage>(input, dst_format);
-
-    // Create and Configure function
-    NEColorConvert color_convert;
-
-    if(1U == src_num_planes)
-    {
-        const Tensor *src_plane = ref_src.plane(0);
-
-        if(1U == dst_num_planes)
-        {
-            Tensor *dst_plane = ref_dst.plane(0);
-            color_convert.configure(src_plane, dst_plane);
-        }
-        else
-        {
-            color_convert.configure(src_plane, &ref_dst);
-        }
-    }
-    else
-    {
-        if(1U == dst_num_planes)
-        {
-            Tensor *dst_plane = ref_dst.plane(0);
-            color_convert.configure(&ref_src, dst_plane);
-        }
-        else
-        {
-            color_convert.configure(&ref_src, &ref_dst);
-        }
-    }
-
-    for(unsigned int plane_idx = 0; plane_idx < src_num_planes; ++plane_idx)
-    {
-        const Tensor *src_plane = ref_src.plane(plane_idx);
-
-        ARM_COMPUTE_EXPECT(src_plane->info()->is_resizable(), framework::LogLevel::ERRORS);
-    }
-    for(unsigned int plane_idx = 0; plane_idx < dst_num_planes; ++plane_idx)
-    {
-        const Tensor *dst_plane = ref_dst.plane(plane_idx);
-
-        ARM_COMPUTE_EXPECT(dst_plane->info()->is_resizable(), framework::LogLevel::ERRORS);
-    }
-}
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(ColorConvert)
-
-template <typename T>
-using NEColorConvertFixture = ColorConvertValidationFixture<MultiImage, Tensor, Accessor, NEColorConvert, T>;
-
-TEST_SUITE(Configuration)
-DATA_TEST_CASE(RGBA, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_RGBA_to_RGB),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-
-DATA_TEST_CASE(RGB, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_RGB_to_RGBA),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-
-DATA_TEST_CASE(RGBtoU8, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_RGB_to_U8),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-
-DATA_TEST_CASE(YUV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_YUYVDataset_to_RGBDataset),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-
-DATA_TEST_CASE(YUVPlanar, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_YUVPlanar_to_RGBDataset),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-
-DATA_TEST_CASE(NV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_RGBDataset_to_NVDataset),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-
-DATA_TEST_CASE(YUYVtoNV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_YUYVDataset_to_NVDataset),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-
-DATA_TEST_CASE(NVtoYUV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_NVDataset_to_YUVDataset),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-TEST_SUITE_END() // Configuration
-
-TEST_SUITE(RGBA)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), ColorConvert_RGBA_to_RGB))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_RGBA_to_RGB))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // RGBA
-
-TEST_SUITE(RGB)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), ColorConvert_RGB_to_RGBA))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_RGB_to_RGBA))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // RGB
-
-TEST_SUITE(RGBtoU8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), ColorConvert_RGB_to_U8))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx], tolerance_u8);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_RGB_to_U8))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx], tolerance_u8);
-    }
-}
-TEST_SUITE_END() // RGBtoU8
-
-TEST_SUITE(YUV)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), ColorConvert_YUYVDataset_to_RGBDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_YUYVDataset_to_RGBDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // YUV
-
-TEST_SUITE(YUVPlanar)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), ColorConvert_YUVPlanar_to_RGBDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_YUVPlanar_to_RGBDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // YUVPlanar
-
-TEST_SUITE(NV)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), ColorConvert_RGBDataset_to_NVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx], tolerance_nv);
-    }
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_RGBDataset_to_NVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx], tolerance_nv);
-    }
-}
-TEST_SUITE_END() // NV
-
-TEST_SUITE(YUYVtoNV)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), ColorConvert_YUYVDataset_to_NVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_YUYVDataset_to_NVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // YUYVtoNV
-
-TEST_SUITE(NVtoYUV)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), ColorConvert_NVDataset_to_YUVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEColorConvertFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), ColorConvert_NVDataset_to_YUVDataset))
-{
-    // Validate output
-    for(unsigned int plane_idx = 0; plane_idx < _dst_num_planes; ++plane_idx)
-    {
-        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
-    }
-}
-TEST_SUITE_END() // NVtoYUV
-
-TEST_SUITE_END() // ColorConvert
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Comparisons.cpp b/tests/validation/NEON/Comparisons.cpp
index f080c834e5..b77bcdd4f0 100644
--- a/tests/validation/NEON/Comparisons.cpp
+++ b/tests/validation/NEON/Comparisons.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,15 +43,6 @@ namespace validation
 {
 namespace
 {
-const auto configure_dataset = combine(datasets::SmallShapes(),
-                                       framework::dataset::make("DataType", { DataType::QASYMM8,
-                                                                              DataType::QASYMM8_SIGNED,
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                                                                              DataType::F16,
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-                                                                              DataType::F32
-                                                                            }));
-
 const auto run_small_dataset           = combine(datasets::ComparisonOperations(), datasets::SmallShapes());
 const auto run_small_broadcast_dataset = combine(datasets::ComparisonOperations(), datasets::SmallShapesBroadcast());
 const auto run_large_dataset           = combine(datasets::ComparisonOperations(), datasets::LargeShapes());
@@ -94,6 +85,17 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 template <typename T>
 using NEComparisonFixture = ComparisonValidationFixture<Tensor, Accessor, NEElementwiseComparison, T>;
 
+TEST_SUITE(Bool)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEComparisonFixture<uint8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(run_small_dataset, framework::dataset::make("DataType", DataType::U8)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END()
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
diff --git a/tests/validation/NEON/ConvertFullyConnectedWeights.cpp b/tests/validation/NEON/ConvertFullyConnectedWeights.cpp
index 90509847ac..65dbbcc55b 100644
--- a/tests/validation/NEON/ConvertFullyConnectedWeights.cpp
+++ b/tests/validation/NEON/ConvertFullyConnectedWeights.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -131,7 +131,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEConvertFullyConnectedWeightsFixture<uint8_t>,
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE_END() // ConvertFullyConnectedWeights
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Convolution.cpp b/tests/validation/NEON/Convolution.cpp
deleted file mode 100644
index b942ddc408..0000000000
--- a/tests/validation/NEON/Convolution.cpp
+++ /dev/null
@@ -1,497 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEConvolution.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ConvolutionFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/** Tolerance value for comparing reference's output against implementation
- *
- * This is due to the fact that NEON target performs multiplication with reciprocal of scale,
- * while reference performs direct division with scale.
- */
-constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);
-constexpr AbsoluteTolerance<int16_t> tolerance_s16(1);
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(CustomConvolution)
-TEST_SUITE(Square3x3)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })),
-                                                                               datasets::BorderModes()),
-                                                                       framework::dataset::make("filter_size", { 3 })),
-               shape, output_data_type, border_mode, filter_size)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, DataType::U8);
-    Tensor dst = create_tensor<Tensor>(shape, output_data_type);
-
-    // Create conv matrix
-    std::array<int16_t, 9> conv = { 0 };
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEConvolution3x3 convolution;
-    convolution.configure(&src, &dst, conv.data(), 0, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), BorderSize(filter_size / 2));
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(1);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-1);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
-template <typename T>
-using NEConvolutionFixture = ConvolutionSquareValidationFixture<Tensor, Accessor, NEConvolution3x3, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 3 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 3 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square3x3
-
-TEST_SUITE(Square5x5)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })),
-                                                                               datasets::BorderModes()),
-                                                                       framework::dataset::make("filter_size", { 5 })),
-               shape, output_data_type, border_mode, filter_size)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, DataType::U8);
-    Tensor dst = create_tensor<Tensor>(shape, output_data_type);
-
-    // Create conv matrix
-    std::array<int16_t, 25> conv = { 0 };
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEConvolution5x5 convolution;
-    convolution.configure(&src, &dst, conv.data(), 0, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), BorderSize(filter_size / 2));
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(2);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-2);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
-template <typename T>
-using NEConvolutionFixture = ConvolutionSquareValidationFixture<Tensor, Accessor, NEConvolution5x5, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square5x5
-
-TEST_SUITE(Square7x7)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })),
-                                                                               datasets::BorderModes()),
-                                                                       framework::dataset::make("filter_size", { 7 })),
-               shape, output_data_type, border_mode, filter_size)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, DataType::U8);
-    Tensor dst = create_tensor<Tensor>(shape, output_data_type);
-
-    // Create conv matrix
-    std::array<int16_t, 49> conv = { 0 };
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEConvolution7x7 convolution;
-    convolution.configure(&src, &dst, conv.data(), 0, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), BorderSize(filter_size / 2));
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(3);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-3);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
-template <typename T>
-using NEConvolutionFixture = ConvolutionSquareValidationFixture<Tensor, Accessor, NEConvolution7x7, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square7x7
-
-TEST_SUITE(Square9x9)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })),
-                                                                               datasets::BorderModes()),
-                                                                       framework::dataset::make("filter_size", { 9 })),
-               shape, output_data_type, border_mode, filter_size)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, DataType::U8);
-    Tensor dst = create_tensor<Tensor>(shape, output_data_type);
-
-    // Create conv matrix
-    std::array<int16_t, 81> conv = { 0 };
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEConvolution9x9 convolution;
-    convolution.configure(&src, &dst, conv.data(), 0, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), BorderSize(filter_size / 2));
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(4);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-4);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
-template <typename T>
-using NEConvolutionFixture = ConvolutionSquareValidationFixture<Tensor, Accessor, NEConvolution9x9, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Square9x9
-
-TEST_SUITE(Rectangle)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType",
-{ DataType::U8, DataType::S16 })),
-datasets::BorderModes()),
-framework::dataset::make("filter_width", { 3, 5, 7, 9 })),
-framework::dataset::make("filter_height", { 3, 5, 7, 9 })),
-shape, output_data_type, border_mode, filter_width, filter_height)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, DataType::U8);
-    Tensor dst = create_tensor<Tensor>(shape, output_data_type);
-
-    // Create conv matrix
-    std::vector<int16_t> conv(filter_height * filter_width);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEConvolutionRectangle convolution;
-    convolution.configure(&src, &dst, conv.data(), filter_width, filter_height, 1, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), BorderSize(filter_height / 2, filter_width / 2));
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(filter_width / 2);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-(filter_width / 2));
-
-    const PaddingSize width_padding = calculator.required_padding();
-
-    calculator.set_border_size(filter_height / 2);
-    calculator.set_access_offset(-(filter_height / 2));
-    const PaddingSize height_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), width_padding, height_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
-template <typename T>
-using NEConvolutionFixture = ConvolutionRectangleValidationFixture<Tensor, Accessor, NEConvolutionRectangle, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                                 framework::dataset::make("filter_width", { 3, 5, 7, 9 })),
-                                                                                                         framework::dataset::make("filter_height", { 3, 5, 7, 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                                 framework::dataset::make("filter_width", { 3, 5, 7, 9 })),
-                                                                                                         framework::dataset::make("filter_height", { 3, 5, 7, 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Rectangle
-
-TEST_SUITE(Separable5x5)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSeparableValidationFixture<Tensor, Accessor, NEConvolution5x5, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 5 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Separable5x5
-
-TEST_SUITE(Separable7x7)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSeparableValidationFixture<Tensor, Accessor, NEConvolution7x7, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 7 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Separable7x7
-
-TEST_SUITE(Separable9x9)
-template <typename T>
-using NEConvolutionFixture = ConvolutionSeparableValidationFixture<Tensor, Accessor, NEConvolution9x9, T>;
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::U8)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_u8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                                 framework::dataset::make("DataType",
-                                                                                                                         DataType::S16)),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("filter_size", { 9 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(_height / 2, _width / 2)), tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Separable9x9
-
-TEST_SUITE_END() // CustomConvolution
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Convolution3D.cpp b/tests/validation/NEON/Convolution3D.cpp
new file mode 100644
index 0000000000..4185488742
--- /dev/null
+++ b/tests/validation/NEON/Convolution3D.cpp
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEConv3D.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/DirectConvolution3DFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+const RelativeTolerance<half_float::half> rel_tolerance_f16(half_float::half(0.2f)); /**< Relative tolerance value for FP16 types */
+const AbsoluteTolerance<float>            abs_tolerance_f16(0.2f);                   /**< Absolute tolerance for FP16 types */
+constexpr float                           tolerance_num = 0.07f;                     /**< Tolerance number for the FP16 implementation */
+#endif                                                                               /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+constexpr AbsoluteTolerance<float>   tolerance_fp32(0.001f);                         /**< Absolute tolerance for the FP32 tests */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);                           /**< Absolute tolerance shared by the QASYMM8 and QASYMM8_SIGNED tests */
+
+/** Activation function dataset: a default-constructed ActivationLayerInfo and LU_BOUNDED_RELU built with 0.5f. */
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
+});
+
+const auto data_precommit = combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+                                                                                    datasets::SmallDirectConv3DShapes(),
+                                                                                    framework::dataset::make("StrideX", { 1, 5, 8 })),
+                                                                                framework::dataset::make("StrideY", { 1, 2, 3 })),
+                                                                            framework::dataset::make("StrideZ", { 1, 2, 1 })),
+                                                                        framework::dataset::make("PadX", { 0, 1, 2 })),
+                                                                    framework::dataset::make("PadY", { 0, 2, 1 })),
+                                                                framework::dataset::make("PadZ", { 0, 3, 5 })),
+                                                            framework::dataset::make("KernelWidth", { 3, 5, 9 })),
+                                                        framework::dataset::make("KernelHeight", { 2, 1, 3 })),
+                                                    framework::dataset::make("KernelDepth", { 1, 2, 3 })),
+                                                framework::dataset::make("NumKernels", { 2, 3, 8 })),
+                                            framework::dataset::make("HasBias", { true, false })),
+                                    ActivationFunctionsDataset);
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(Convolution3D)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
+        framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Mismatching data type input/weights
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Mismatching input feature maps
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Invalid weights dimensions
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NHWC), // Invalid data layout
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Invalid biases size
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Invalid biases dimensions
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::F32, DataLayout::NDHWC), // Invalid output size
+                                                TensorInfo(TensorShape(27U, 13U, 2U, 4U), 1U, DataType::U32, DataLayout::NDHWC), // Invalid data type
+                                              }),
+        framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::F16),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 3U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U, 3U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::F32),
+                                                 TensorInfo(TensorShape(4U, 3U, 3U, 3U, 2U), 1U, DataType::U32),
+                                              })),
+        framework::dataset::make("BiasesInfo",{ TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(3U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U, 2U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(4U), 1U, DataType::F32),
+                                              })),
+        framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(26U, 11U, 4U), 1U, DataType::F32),
+                                                TensorInfo(TensorShape(25U, 11U, 4U), 1U, DataType::U32),
+                                              })),
+        framework::dataset::make("Expected", { false, false, false, false, false, false, false, false})),
+        input_info, weights_info, biases_info, output_info, expected)
+{
+        const Conv3dInfo  conv3d_info(Size3D(1, 1, 1), Padding3D(0, 0, 0), ActivationLayerInfo(), Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false);
+        bool is_valid = bool(NEConv3D::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv3d_info));
+        ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using NEDirectConvolution3DFixture = DirectConvolution3DValidationFixture<Tensor, Accessor, NEConv3D, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolution3DFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(data_precommit,
+                                                                                                                 framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NDHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+TEST_SUITE_END() // FP32
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolution3DFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(data_precommit,
+                                                                                                                        framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                framework::dataset::make("DataLayout", { DataLayout::NDHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+TEST_SUITE_END() // Float
+
+template <typename T>
+using NEDirectConvolution3DQuantizedFixture = DirectConvolution3DValidationQuantizedFixture<Tensor, Accessor, NEConv3D, T>;
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolution3DQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+                                                                                                                   framework::dataset::make("InputShape", { TensorShape(7U, 5U, 3U, 13U, 3U),
+                                                                                                                           TensorShape(15U, 7U, 11U, 7U),
+                                                                                                                           TensorShape(19U, 5U, 16U, 4U),
+                                                                                                                           TensorShape(13U, 5U, 17U, 2U)
+                                                                                                                                                          }),
+                                                                                                                   framework::dataset::make("StrideX", { 1, 3, 2, 1 })),
+                                                                                                               framework::dataset::make("StrideY", { 2, 1, 3, 1 })),
+                                                                                                           framework::dataset::make("StrideZ", { 3, 2, 1, 1 })),
+                                                                                                       framework::dataset::make("PadX", { 0, 2, 1, 0 })),
+                                                                                                   framework::dataset::make("PadY", { 1, 0, 2, 0 })),
+                                                                                               framework::dataset::make("PadZ", { 2, 1, 0, 0 })),
+                                                                                           framework::dataset::make("KernelWidth", { 3, 7, 5, 1 })),
+                                                                                       framework::dataset::make("KernelHeight", { 5, 3, 7, 1 })),
+                                                                                   framework::dataset::make("KernelDepth", { 7, 5, 3, 1 })),
+                                                                               framework::dataset::make("NumKernels", { 5, 3, 1, 11 })),
+                                                                           framework::dataset::make("HasBias", { true, true, true, false })),
+                                                                       framework::dataset::make("Activation", ActivationLayerInfo())),
+                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                       framework::dataset::make("DataLayout", DataLayout::NDHWC)),
+                                               framework::dataset::make("SrcQuantizationInfo", QuantizationInfo(0.1f, 10))),
+                                       framework::dataset::make("WeightsQuantizationInfo", QuantizationInfo(0.3f, 20))),
+                               framework::dataset::make("DstQuantizationInfo", QuantizationInfo(0.2f, 5))))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolution3DQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+                                                                                                                   framework::dataset::make("InputShape", { TensorShape(7U, 5U, 3U, 13U, 3U),
+                                                                                                                           TensorShape(15U, 7U, 11U, 7U),
+                                                                                                                           TensorShape(19U, 5U, 16U, 4U),
+                                                                                                                           TensorShape(13U, 5U, 17U, 2U)
+                                                                                                                                                          }),
+                                                                                                                   framework::dataset::make("StrideX", { 1, 3, 2, 1 })),
+                                                                                                               framework::dataset::make("StrideY", { 2, 1, 3, 1 })),
+                                                                                                           framework::dataset::make("StrideZ", { 3, 2, 1, 1 })),
+                                                                                                       framework::dataset::make("PadX", { 0, 2, 1, 0 })),
+                                                                                                   framework::dataset::make("PadY", { 1, 0, 2, 0 })),
+                                                                                               framework::dataset::make("PadZ", { 2, 1, 0, 0 })),
+                                                                                           framework::dataset::make("KernelWidth", { 3, 7, 5, 1 })),
+                                                                                       framework::dataset::make("KernelHeight", { 5, 3, 7, 1 })),
+                                                                                   framework::dataset::make("KernelDepth", { 7, 5, 3, 1 })),
+                                                                               framework::dataset::make("NumKernels", { 5, 3, 1, 11 })),
+                                                                           framework::dataset::make("HasBias", { true, true, true, false })),
+                                                                       framework::dataset::make("Activation", ActivationLayerInfo())),
+                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                       framework::dataset::make("DataLayout", DataLayout::NDHWC)),
+                                               framework::dataset::make("SrcQuantizationInfo", QuantizationInfo(0.1f, 10))),
+                                       framework::dataset::make("WeightsQuantizationInfo", QuantizationInfo(0.3f, 20))),
+                               framework::dataset::make("DstQuantizationInfo", QuantizationInfo(0.2f, 5))))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
+TEST_SUITE_END() // Convolution3D
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 19f69d120f..d739d4e1a4 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,15 +23,21 @@
  */
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuGemmConv2d.h"
+#include "src/cpu/operators/CpuGemmDirectConv2d.h"
+#include "src/cpu/operators/CpuWinogradConv2d.h"
+
 #include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
 #include "tests/datasets/LargeConvolutionLayerDataset.h"
 #include "tests/datasets/SmallConvolutionLayerDataset.h"
-#include "tests/datasets/TinyConvolutionLayerDataset.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
@@ -45,6 +51,22 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
+
+namespace detail
+{
+template <> // Specialisation for NEGEMMConv2d: its configure() call below takes one Conv2dInfo instead of separate arguments
+void configure_conv_function<NEGEMMConv2d, Tensor>(NEGEMMConv2d &func,
+                                                   Tensor *src, const Tensor *weights, const Tensor *bias, Tensor *dst,
+                                                   const PadStrideInfo &info, const WeightsInfo &weights_info,
+                                                   const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+{
+    ARM_COMPUTE_UNUSED(weights_info); // weights_info is not forwarded to func.configure()
+
+    Conv2dInfo conv_info(info, dilation, act_info, false, num_groups); // NOTE(review): the literal false is presumably enable_fast_math - confirm against Conv2dInfo
+    func.configure(src, weights, bias, dst, conv_info);
+}
+} // namespace detail
 namespace
 {
 const RelativeTolerance<float> rel_tolerance_f32(0.01f);              /**< Relative tolerance for FP32 types */
@@ -62,10 +84,17 @@ const RelativeTolerance<half_float::half> rel_tolerance_f16(half_float::half(0.2
 const AbsoluteTolerance<float>            abs_tolerance_f16(0.2f);                   /**< Absolute tolerance for FP16 types */
 constexpr float                           tolerance_num = 0.07f;                     /**< Tolerance number for the FP16 implementation */
 #endif                                                                               /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0);                           /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+
+#ifdef ARM_COMPUTE_ENABLE_SME
+// TODO(COMPMID-6011): SME kernels and the reference model use different rounding modes.
+// Temporarily increase the tolerance for quantized data.
+constexpr AbsoluteTolerance<float> tolerance_qasymm8(1.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+#else                                                      // ARM_COMPUTE_ENABLE_SME
+constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+#endif                                                     // ARM_COMPUTE_ENABLE_SME
 
 /** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
+const auto CNNDataTypes = make("DataType",
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     DataType::F16,
@@ -73,14 +102,41 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
     DataType::F32,
     DataType::QASYMM8,
 });
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+const auto ActivationFunctionsDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f)
 });
 
-const auto QuantizationData = framework::dataset::make("QuantizationInfo",
+const auto NoActivation = make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+});
+
+const auto ActivationFunctionsDatasetNightly = make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f, -0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SQUARE),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SWISH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::HARD_SWISH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 2.f, 1.f),
+#ifdef __aarch64__
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::GELU),
+#endif // __aarch64__
+});
+
+const auto QuantizationData = make("QuantizationInfo",
 {
     QuantizationInfo(0.5f, 10),
     QuantizationInfo(0.3f, 3),
@@ -95,32 +151,32 @@ TEST_SUITE(ConvolutionLayer)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-                                          framework::dataset::make("InputInfo", { TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F32),
+                                          make("InputInfo", { TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F32),
                                                                                   TensorInfo(TensorShape(23U, 27U, 32U, 4U), 1, DataType::F32),
                                                                                   TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32),
                                                                                   TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32)
                                           }),
-                                          framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F32),
+                                          make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F32),
                                                                                     TensorInfo(TensorShape(5U, 5U, 32U, 21U), 1, DataType::F32),
                                                                                     TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32),
                                                                                     TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16)
                                           })),
-                                          framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F32),
+                                          make("OutputInfo", { TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F32),
                                                                                    TensorInfo(TensorShape(19U, 23U, 21U, 4U), 1, DataType::F32),
                                                                                    TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32),
                                                                                    TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32)
                                           })),
-                                          framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+                                          make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
                                                                                  PadStrideInfo(1, 1, 0, 0),
                                                                                  PadStrideInfo(2, 1, 0, 0),
                                                                                  PadStrideInfo(3, 2, 1, 0)
                                           })),
-                                          framework::dataset::make("FastMath", { true,
+                                          make("FastMath", { true,
                                                                                  true,
                                                                                  false,
                                                                                  false
                                           })),
-                                                                           framework::dataset::make("Expected", { ConvolutionMethod::WINOGRAD, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
+                                                                           make("Expected", { ConvolutionMethod::WINOGRAD, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
                input_info, weights_info, output_info, conv_info, fast_math, expected)
 {
     ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true),
@@ -132,30 +188,267 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
 // *INDENT-ON*
 TEST_SUITE_END() // ConvolutionLayer
 
+/*
+    Testing Strategy of Neon Winograd:
+        - There is no need to thoroughly test nchw cases because winograd kernels accept
+          nhwc and the tensors are permuted before and after if they're nchw.
+        - Except for relu and bounded relu, testing activations for a single input
+          combination is enough because the activation is not fused into winograd and is
+          called separately.
+*/
 TEST_SUITE(WinogradLayer)
 template <typename T>
 using NEWinogradConvolutionLayerFixture = WinogradConvolutionLayerFastMathValidationFixture<Tensor, Accessor, NEWinogradConvolutionLayer, T>;
+template <typename T>
+using NEWinogradConvolutionLayerMixedDataLayoutFixture = WinogradConvolutionLayerFastMathValidationFixture<Tensor, Accessor, NEWinogradConvolutionLayer, T, T, true, true>;
 
 template <typename T>
 using NEWinogradConvolutionLayerNoBiasFixture = WinogradConvolutionLayerFastMathValidationFixture<Tensor, Accessor, NEWinogradConvolutionLayer, T, T, false>;
 
+/** Test case for memory injection in @ref cpu::CpuWinogradConv2d.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto                winograd = std::make_unique<cpu::CpuWinogradConv2d>();
+    const auto          src_info = TensorInfo(TensorShape(8U, 8U, 32U), 1, DataType::F32);
+    const auto          w_info   = TensorInfo(TensorShape(1U), 1, DataType::F32);
+    const auto          b_info   = TensorInfo(TensorShape(1U, 3U, 32U, 1U), 1, DataType::F32);
+    auto                dst_info = TensorInfo(TensorShape(8U, 6U, 1U), 1, DataType::F32);
+    const PadStrideInfo pad_info{};
+
+    winograd->configure(&src_info, &b_info, &w_info, &dst_info, pad_info);
+
+    // Input tensors are created once here and reused by every call of the lambda below; only dst is created per call
+    auto a = create_tensor<Tensor>(src_info);
+    auto b = create_tensor<Tensor>(b_info);
+    auto c = create_tensor<Tensor>(w_info);
+    a.allocator()->allocate();
+    b.allocator()->allocate();
+    c.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &a }, { TensorType::ACL_SRC_1, &b }, { TensorType::ACL_SRC_2, &c } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &b }, { TensorType::ACL_SRC_2, &c } };
+
+    auto mg       = MemoryGroup{};
+    auto ws       = manage_workspace<Tensor>(winograd->workspace(), mg, run_pack, prep_pack);
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+        library->fill_tensor_value(Accessor(a), 1.f);
+        library->fill_tensor_value(Accessor(b), 2.f);
+        library->fill_tensor_value(Accessor(c), 3.f);
+
+        // This operator is configured once and captured by this lambda.
+        winograd->prepare(prep_pack);
+        winograd->run(run_pack);
+        return dst;
+    };
+
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Test case for memory injection in @ref NEWinogradConvolutionLayer.
+ *
+ * Make sure @ref NEWinogradConvolutionLayer still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto                gemm     = std::make_unique<NEWinogradConvolutionLayer>();
+    const auto          src_info = TensorInfo(TensorShape(8U, 8U, 32U), 1, DataType::F32);
+    const auto          w_info   = TensorInfo(TensorShape(1U), 1, DataType::F32);
+    const auto          b_info   = TensorInfo(TensorShape(1U, 3U, 32U, 1U), 1, DataType::F32);
+    auto                dst_info = TensorInfo(TensorShape(8U, 6U, 1U), 1, DataType::F32);
+    const PadStrideInfo pad_info{};
+
+    auto run_conv = [&]()
+    {
+        auto src = create_tensor<Tensor>(src_info);
+        auto w   = create_tensor<Tensor>(w_info);
+        auto b   = create_tensor<Tensor>(b_info);
+        auto dst = create_tensor<Tensor>(dst_info);
+
+        gemm->configure(&src, &b, &w, &dst, pad_info);
+
+        src.allocator()->allocate();
+        b.allocator()->allocate();
+        w.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(b), 2.f);
+        library->fill_tensor_value(Accessor(w), 3.f);
+        gemm->run();
+        return dst;
+    };
+
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+DATA_TEST_CASE(SupportedKernels, framework::DatasetMode::ALL, zip(
+                   make("WeightsInfo",
+{
+    // Shapes are always in NCHW format. When layout is NHWC, the shape is permuted
+
+    // Fp32, NCHW/NHWC (layout does not matter)
+    // 3x1, 1x3, 3x3 --> all TRUE
+    TensorInfo(TensorShape(3U, 3U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(1U, 3U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(3U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+
+    // 5x1, 1x5, 5x5 --> all TRUE
+    TensorInfo(TensorShape(5U, 5U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+    TensorInfo(TensorShape(1U, 5U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(5U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+
+    // 7x1, 1x7, 7x7
+    //  --> 7x1 and 1x7 TRUE, 7x7 FALSE
+    TensorInfo(TensorShape(7U, 7U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+    TensorInfo(TensorShape(1U, 7U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(7U, 1U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+
+    // unsupported kernel sizes
+    TensorInfo(TensorShape(2U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(5U, 2U, 2U, 8U), 1, DataType::F32, DataLayout::NHWC),
+    TensorInfo(TensorShape(3U, 6U, 2U, 8U), 1, DataType::F32, DataLayout::NCHW),
+
+    // Fp16
+    TensorInfo(TensorShape(3U, 3U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(1U, 3U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(3U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+
+    // 5x1, 1x5, 5x5 --> all FALSE
+    TensorInfo(TensorShape(5U, 5U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+    TensorInfo(TensorShape(1U, 5U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(5U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+
+    // 7x1, 1x7, 7x7
+    //  --> all FALSE
+    TensorInfo(TensorShape(7U, 7U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+    TensorInfo(TensorShape(1U, 7U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(7U, 1U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+
+    // unsupported kernel sizes
+    TensorInfo(TensorShape(2U, 2U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(5U, 2U, 2U, 8U), 1, DataType::F16, DataLayout::NHWC),
+    TensorInfo(TensorShape(3U, 6U, 2U, 8U), 1, DataType::F16, DataLayout::NCHW),
+
+}),
+make("Expected",
+{
+    // fp32
+    true, true, true,    // 3x3, 1x3, 3x1
+    true, true, true,    // 5x5, 1x5, 5x1
+    false, true, true,   // 7x7, 1x7, 7x1
+    false, false, false, // random unsupported kernels
+
+    // fp16
+    true, false, false,  // 3x3, 1x3, 3x1
+    false, false, false, // 5x5, 1x5, 5x1
+    false, false, false, // 7x7, 1x7, 7x1
+    false, false, false, // random unsupported kernels
+})),
+weights_info_const, expected_const)
+{
+    DataType   data_type   = weights_info_const.data_type();
+    DataLayout data_layout = weights_info_const.data_layout();
+
+    TensorInfo input_info   = TensorInfo(TensorShape(17U, 31U, 2U), 1, data_type);
+    TensorInfo bias_info    = TensorInfo(TensorShape(8U), 1, data_type);
+    TensorInfo weights_info = weights_info_const;
+
+    if(data_layout == DataLayout::NHWC)
+    {
+        // Convert to NHWC
+        PermutationVector perm = PermutationVector(2U, 0U, 1U);
+
+        TensorShape input_shape   = input_info.tensor_shape();
+        TensorShape weights_shape = weights_info.tensor_shape();
+        permute(input_shape, perm);
+        permute(weights_shape, perm);
+
+        input_info.set_tensor_shape(input_shape);
+        weights_info.set_tensor_shape(weights_shape);
+
+        input_info.set_data_layout(data_layout);
+        weights_info.set_data_layout(data_layout);
+        bias_info.set_data_layout(data_layout);
+    }
+
+    PadStrideInfo conv_info(1, 1, 0, 0);
+
+    TensorShape output_shape = compute_deep_convolution_shape(input_info, weights_info, conv_info);
+    TensorInfo  output_info  = TensorInfo(output_shape, 1, data_type, data_layout);
+
+    Status status = NEWinogradConvolutionLayer::validate(
+                        &input_info,
+                        &weights_info,
+                        &bias_info,
+                        &output_info,
+                        conv_info,
+                        ActivationLayerInfo(),
+                        true /* fast math */);
+
+    Status fp16_supported = ::arm_compute::error_on_unsupported_cpu_fp16("N/A", "N/A", 0, &input_info);
+    bool   expected       = expected_const && static_cast<bool>(fp16_supported);
+
+    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+}
+
 TEST_SUITE(FP32)
 
 TEST_SUITE(Conv1x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer1x3Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEWinogradConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT,
+                       combine(
+                           make("Input", TensorShape(8U, 8U, 32U)),
+                           make("Weight", TensorShape(1U, 3U, 32U, 1U)),
+                           make("Bias", TensorShape(1U)),
+                           make("Output", TensorShape(8U, 6U, 1U)),
+                           make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)),
+                           make("Dilation", Size2D(1U, 1U)),
+                           make("DataType", { DataType::F32 }),
+                           ActivationFunctionsDataset,
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer1x3Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -165,19 +458,19 @@ TEST_SUITE_END() // Conv1x3
 
 TEST_SUITE(Conv3x1)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer3x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer3x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -187,19 +480,19 @@ TEST_SUITE_END() // Conv3x1
 
 TEST_SUITE(Conv1x5)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer1x5Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer1x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer1x5Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -209,19 +502,19 @@ TEST_SUITE_END() // Conv1x5
 
 TEST_SUITE(Conv5x1)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer5x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer5x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -231,10 +524,10 @@ TEST_SUITE_END() // Conv5x1
 
 TEST_SUITE(Conv7x1)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer7x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer7x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
@@ -242,9 +535,9 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
                        combine(combine(combine(datasets::LargeWinogradConvolutionLayer7x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                                               make("DataType", { DataType::F32 })),
+                                       make("ActivationInfo", { ActivationLayerInfo() })),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -253,20 +546,20 @@ TEST_SUITE_END() // Conv7x1
 
 TEST_SUITE(Conv1x7)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer1x7Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer1x7Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer7x1Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer7x1Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_1xN_f32);
@@ -275,20 +568,40 @@ TEST_SUITE_END() // Conv1x7
 
 TEST_SUITE(Conv3x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+
+{
+    // Validate output
+    validate(Accessor(_target), _reference, abs_tolerance_f32);
+}
 
+/// It's enough to run the activations for a single weight/input combination and data type because
+/// activation function is called on top of the winograd output as a separate operator
+/// TODO: Enable after COMPMID-6573 is resolved
+FIXTURE_DATA_TEST_CASE(RunActivations, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::DISABLED,
+                       combine(
+                           make("Input", TensorShape(3U, 3U, 32U)),
+                           make("Weight", TensorShape(3U, 3U, 32U, 4U)),
+                           make("Bias", TensorShape(4U)),
+                           make("Output", TensorShape(1U, 1U, 4U)),
+                           make("PadStrideInfo", PadStrideInfo(1, 1, 0, 0)),
+                           make("Dilation", Size2D(1U, 1U)),
+                           make("DataType", { DataType::F32 }),
+                           ActivationFunctionsDatasetNightly,
+                           make("DataLayout", { DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
+
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 
 {
     // Validate output
@@ -299,20 +612,20 @@ TEST_SUITE_END() // Conv3x3
 
 TEST_SUITE(Conv5x5)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(),
+                               make("DataType", { DataType::F32 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 
 {
     // Validate output
@@ -322,12 +635,12 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEWinogradConvolutionLayerFixture<float>, frame
 TEST_SUITE_END() // Conv5x5
 
 FIXTURE_DATA_TEST_CASE(RunSmallNoBias, NEWinogradConvolutionLayerNoBiasFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
-                                                                          datasets::SmallWinogradConvolutionLayer5x5Dataset()),
-                                               framework::dataset::make("DataType", { DataType::F32 })),
-                                       ActivationFunctionsDataset),
-
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(framework::dataset::concat(
+                                   datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                                   datasets::SmallWinogradConvolutionLayer5x5Dataset()),
+                               make("DataType", { DataType::F32 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, abs_tolerance_f32);
@@ -339,12 +652,39 @@ TEST_SUITE_END() // FP32
 TEST_SUITE(FP16)
 using CLWinogradConvolutionLayerFastMathFixture16 = WinogradConvolutionLayerFastMathValidationFixture<Tensor, Accessor, NEWinogradConvolutionLayer, half, float>;
 
+DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(
+                   make("InputInfo", { TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F16),
+                                       TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F16)
+                                     }),
+                   make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F16),
+                                         TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F16)
+                                       }),
+                   make("OutputInfo", { TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F32),
+                                        TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F16)
+                                      }),
+                   make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+                                      PadStrideInfo(1, 1, 0, 0)
+                                    }),
+                   make("FastMath",
+{
+    false, // case fp16 and fast_math False then disable Winograd
+    true   // case fp16 and fast_math True then enable Winograd
+}),
+make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD })),
+input_info, weights_info, output_info, conv_info, fast_math, expected)
+{
+    ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true),
+                                                                            &weights_info.clone()->set_is_resizable(true),
+                                                                            &output_info.clone()->set_is_resizable(true), conv_info, WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), fast_math);
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+
 TEST_SUITE(Conv3x3)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                               make("DataType", { DataType::F16 }),
+                               ActivationFunctionsDataset,
+                               make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 
 {
     // Validate output
@@ -352,10 +692,10 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture16, fr
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture16, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
-                                               framework::dataset::make("DataType", { DataType::F16 })),
-                                       ActivationFunctionsDataset),
-                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+                       combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
+                               make("DataType", { DataType::F16 }),
+                               make("ActivationInfo", { ActivationLayerInfo() }),
+                               make("DataLayout", { DataLayout::NHWC })))
 
 {
     // Validate output
@@ -366,16 +706,470 @@ TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 TEST_SUITE_END() // WinogradLayer
 
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+TEST_SUITE(FIXED_FORMAT_KERNELS)
+TEST_SUITE(VariableWeightUtils)
+
+// UC2_1_* tests: the user requests a specific fixed format, but there is no kernel that supports it.
+
+template <typename ConvolutionClass>
+using HasOptImplFixtureNoFastMath = HasOptImplFixture<ConvolutionClass, /*enable_fast_math*/ false>;
+
+template <typename ConvolutionClass>
+using HasOptImplFixtureFastMath = HasOptImplFixture<ConvolutionClass, /*enable_fast_math*/ true>;
+
+// UC2_1
+
+FIXTURE_DATA_TEST_CASE(UC2_1_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+FIXTURE_DATA_TEST_CASE(UC2_1_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC2_1_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC2_1_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo2 })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+// UC2_2_* tests: the user requests a specific fixed format, and a
+// kernel that supports that fixed format is found.
+
+FIXTURE_DATA_TEST_CASE(UC2_2_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo4 })))
+{
+    ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format == arm_compute::WeightFormat::OHWIo4, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC2_2_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo4 })))
+{
+    ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format == arm_compute::WeightFormat::OHWIo4, framework::LogLevel::ERRORS);
+}
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+// These tests currently only work with SVE length 256
+// If another SVE length is used, no kernel will be found
+// This needs to be addressed in order to ensure it doesn't revert to FP32 kernels for systems with SVE length other than 256
+FIXTURE_DATA_TEST_CASE(UC2_2_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo8i4_bf16 })))
+{
+    if(Scheduler::get().cpu_info().has_bf16() && (arm_gemm::utils::get_vector_length<float>() == 8)){
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT_EQUAL(_computed_weight_format, arm_compute::WeightFormat::OHWIo8i4_bf16, framework::LogLevel::ERRORS);
+    }
+    else{
+        ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(UC2_2_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo8i4_bf16 })))
+{
+    if(Scheduler::get().cpu_info().has_bf16() && (arm_gemm::utils::get_vector_length<float>() == 8)){
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format == arm_compute::WeightFormat::OHWIo8i4_bf16, framework::LogLevel::ERRORS);
+    }
+    else{
+        ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+    }
+}
+
+#endif // ARM_COMPUTE_ENABLE_BF16
+
+// UC3_1_* tests: the user queries for ANY fixed format, but there is
+// no kernel that supports the use case specified by the user (for
+// example, there is no fixed format kernel for the datatype of the
+// problem).
+
+FIXTURE_DATA_TEST_CASE(UC3_1_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::S32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_1_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::S32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_1_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::S32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_1_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::S32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS);
+}
+
+// UC3_2_* tests: the user queries for ANY fixed format. The search
+// succeeds and the fixed format found is reported back for
+// consumption by the user. Note that we only check that
+// _computed_weight_format is not one of the values that are not
+// fixed formats (ANY and UNSPECIFIED). This is because the weight
+// format that the runtime produces depends on the size of the vector
+// units of the hardware where the test is executed. For example, a
+// format like OHWIo4 for FP32 data returned for 128-bit NEON hardware
+// is replaced by OHWIo8 when running on 256-bit SVE.
+
+FIXTURE_DATA_TEST_CASE(UC3_2_CpuGemmConv2d, HasOptImplFixtureNoFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_2_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+}
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+
+FIXTURE_DATA_TEST_CASE(UC3_2_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    if(Scheduler::get().cpu_info().has_bf16()){
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS);
+    }
+    else{
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(UC3_2_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL,
+                       combine(framework::dataset::make("DataType", { DataType::F32 }),
+                               framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY })))
+{
+    if(Scheduler::get().cpu_info().has_bf16()){
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS);
+    }
+    else{
+        ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS);
+    }
+}
+
+#endif // ARM_COMPUTE_ENABLE_BF16
+
+namespace
+{
+using TestCaseType          = std::tuple<TensorShape, TensorShape, arm_compute::WeightFormat>;
+auto prepare_weights_shapes = framework::dataset::make("TensorShape",
+{
+    // OHWIo<interleave_by>i<block_by>
+    //
+    // OHWI --> O'HWI', where:
+    //
+    //   O'= smallest multiple of <interleave_by> such that O<=O'
+    //   I'= smallest multiple of <block_by> such that I<=I'
+    //
+
+    // Change N for OHWIo4
+    TestCaseType({ { 1U, 1U, 1U, 1U }, { 1U, 1U, 1U, 4U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 2U }, { 1U, 1U, 1U, 4U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 3U }, { 1U, 1U, 1U, 4U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 4U }, { 1U, 1U, 1U, 4U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 5U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 6U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 7U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 8U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1U, 1U, 1U, 9U }, { 1U, 1U, 1U, 12U }, arm_compute::WeightFormat::OHWIo4 }),
+    // Change N for OHWIo8
+    TestCaseType({ { 1U, 1U, 1U, 1U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 2U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 3U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 4U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 5U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 6U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 7U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 8U }, { 1U, 1U, 1U, 8U }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1U, 1U, 1U, 9U }, { 1U, 1U, 1U, 16U }, arm_compute::WeightFormat::OHWIo8 }),
+    // Change N for OHWIo4 when H, W and C are not 1
+    TestCaseType({ { 3U, 4U, 2U, 1U }, { 3, 4, 2, 4 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 2U }, { 3, 4, 2, 4 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 3U }, { 3, 4, 2, 4 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 4U }, { 3, 4, 2, 4 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 5U }, { 3, 4, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 6U }, { 3, 4, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 7U }, { 3, 4, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 8U }, { 3, 4, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 9U }, { 3, 4, 2, 12 }, arm_compute::WeightFormat::OHWIo4 }),
+
+    // Fix N and move HWI around, with different data layouts and formats
+    TestCaseType({ { 2U, 4U, 3U, 5U }, { 2, 4, 3, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 3U, 4U, 2U, 5U }, { 3, 4, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 2U, 4U, 3U, 9U }, { 2, 4, 3, 16 }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 3U, 4U, 2U, 9U }, { 3, 4, 2, 16 }, arm_compute::WeightFormat::OHWIo8 }),
+    TestCaseType({ { 1024U, 1U, 1U, 1001U }, { 1024, 1, 1, 1008 }, arm_compute::WeightFormat::OHWIo8 }),
+
+    // Adding <block_by> on I (=C)
+    TestCaseType({ { 1U, 4U, 3U, 5U }, { 2, 4, 3, 8 }, arm_compute::WeightFormat::OHWIo4i2 }),
+    TestCaseType({ { 2U, 4U, 3U, 5U }, { 2, 4, 3, 8 }, arm_compute::WeightFormat::OHWIo4i2 }),
+    TestCaseType({ { 3U, 4U, 3U, 5U }, { 4, 4, 3, 8 }, arm_compute::WeightFormat::OHWIo4i2 }),
+
+    // ---------
+    TestCaseType({ { 2, 2, 1, 5 }, { 2, 2, 1, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+    TestCaseType({ { 1, 2, 2, 5 }, { 1, 2, 2, 8 }, arm_compute::WeightFormat::OHWIo4 }),
+
+});
+} // unnamed namespace
+
+DATA_TEST_CASE(PrepareWeightShape, framework::DatasetMode::ALL,
+               prepare_weights_shapes, shapes)
+{
+    const TensorShape               input_shape    = std::get<0>(shapes);
+    const TensorShape               expected_shape = std::get<1>(shapes);
+    const arm_compute::WeightFormat wf             = std::get<2>(shapes);
+    const DataType                  DT             = DataType::F32;
+    const DataLayout                DL             = DataLayout::NHWC;
+    const auto                      TI             = TensorInfo(input_shape, 1 /*num_channels, deprecated*/, DT, DL);
+    const TensorInfo                computed_info  = ::arm_compute::test::validation::prepare_weights(TI, wf);
+    ARM_COMPUTE_EXPECT_EQUAL(computed_info.tensor_shape(), expected_shape, framework::LogLevel::ERRORS);
+}
+
+TEST_SUITE_END() // VariableWeightUtils
+
+TEST_SUITE(ExperimentalCpuAPIVariableWeightWithFixtures)
+
+template <typename ScalarType>
+using VarWidth = VariableWeightsFixture<cpu::CpuGemmConv2d, Tensor, Accessor, ScalarType, /*enable_fast_math*/ false>;
+
+FIXTURE_DATA_TEST_CASE(RunSmallFloat, VarWidth<float>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F32 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+FIXTURE_DATA_TEST_CASE(RunSmallHalf, VarWidth<half>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F16 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f16, 0.f, half(abs_tolerance_f16));
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+template <typename ScalarType>
+using VarWidthFastMath = VariableWeightsFixture<cpu::CpuGemmConv2d, Tensor, Accessor, ScalarType, /*enable_fast_math*/ true>;
+
+FIXTURE_DATA_TEST_CASE(RunSmallFloatFastMath, VarWidthFastMath<float>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F32 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+#endif // ARM_COMPUTE_ENABLE_BF16
+
+TEST_SUITE_END() // ExperimentalCpuAPIVariableWeightWithFixtures
+
+TEST_SUITE(ExperimentalNEAPIVariableWeightWithFixtures)
+
+template <typename ScalarType>
+using NEGEMMVarWidth = VariableWeightsFixtureNEInterface<NEGEMMConvolutionLayer, Tensor, Accessor, ScalarType, /*enable_fast_math*/ false>;
+
+FIXTURE_DATA_TEST_CASE(NEGEMMRunSmallFloat, NEGEMMVarWidth<float>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F32 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+FIXTURE_DATA_TEST_CASE(NEGEMMRunSmallHalf, NEGEMMVarWidth<half>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F16 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f16, 0.f, half(abs_tolerance_f16));
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+template <typename ScalarType>
+using NEGEMMVarWidthFastMath = VariableWeightsFixtureNEInterface<NEGEMMConvolutionLayer, Tensor, Accessor, ScalarType, /*enable_fast_math*/ true>;
+
+FIXTURE_DATA_TEST_CASE(NEGEMMRunSmallFloatFastMath, NEGEMMVarWidthFastMath<float>, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                               framework::dataset::make("ACL Scalar type", { DataType::F32 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+#endif // ARM_COMPUTE_ENABLE_BF16
+
+TEST_SUITE_END() // ExperimentalNEAPIVariableWeightWithFixtures
+TEST_SUITE_END() // FIXED_FORMAT_KERNELS
+
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+
 TEST_SUITE(GEMMConvolutionLayer)
 template <typename T>
-using NEGEMMConvolutionLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T>;
+using NEGEMMConvolutionLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEConvolutionLayer, T>;
+template <typename T>
+using NEGEMMConvolutionLayerPaddedWeightsFixture = ConvolutionValidationPaddedWeightsFixture<Tensor, Accessor, NEConvolutionLayer, T>;
+template <typename T>
+using NEGEMMConvolutionLayerMixedDataLayoutFixture = ConvolutionValidationFixture<Tensor, Accessor, NEConvolutionLayer, T, true>;
+
+/** Test case for memory injection in @ref cpu::CpuGemmConv2d.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto        conv        = std::make_unique<cpu::CpuGemmConv2d>();
+    const auto  src_info    = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  bias_info   = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NCHW);
+    auto        dst_info    = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  conv_info   = PadStrideInfo(1, 1, 0, 0, 2, 2, DimensionRoundingType::FLOOR);
+    WeightsInfo weights_info(false, 3U, 3U, 1U);
+    conv->configure(&src_info, &weight_info, &bias_info, &dst_info, conv_info, weights_info);
+
+    // src, weight and bias are created once here; only dst is newly created on every call of the run_conv lambda below
+    auto src    = create_tensor<Tensor>(src_info);
+    auto weight = create_tensor<Tensor>(weight_info);
+    auto bias   = create_tensor<Tensor>(bias_info);
+    src.allocator()->allocate();
+    weight.allocator()->allocate();
+    bias.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(conv->workspace(), mg, run_pack, prep_pack);
+
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        // This operator is configured once and captured by this lambda.
+        conv->prepare(prep_pack);
+        conv->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Test case for memory injection in @ref NEGEMMConvolutionLayer.
+ *
+ * Make sure @ref NEGEMMConvolutionLayer still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto        conv        = std::make_unique<NEGEMMConvolutionLayer>();
+    const auto  src_info    = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  bias_info   = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NCHW);
+    auto        dst_info    = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NCHW);
+    const auto  conv_info   = PadStrideInfo(1, 1, 0, 0, 2, 2, DimensionRoundingType::FLOOR);
+    WeightsInfo weights_info(false, 3U, 3U, 1U);
+    auto        run_conv = [&]()
+    {
+        auto src    = create_tensor<Tensor>(src_info);
+        auto weight = create_tensor<Tensor>(weight_info);
+        auto bias   = create_tensor<Tensor>(bias_info);
+        auto dst    = create_tensor<Tensor>(dst_info);
+        conv->configure(&src, &weight, &bias, &dst, conv_info, weights_info);
+        src.allocator()->allocate();
+        weight.allocator()->allocate();
+        bias.allocator()->allocate();
+        dst.allocator()->allocate();
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        conv->run();
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
 
 TEST_SUITE(Float)
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
+#if defined(ARM_COMPUTE_ENABLE_BF16)
 TEST_SUITE(BFLOAT16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                                                                                                                     framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                    framework::dataset::make("DataType", DataType::BFLOAT16)),
+                                                                                                                    framework::dataset::make("DataType", Scheduler::get().cpu_info().has_bf16() ? DataType::BFLOAT16 : DataType::F32)),
                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NHWC })),
                                                                                                             ActivationFunctionsDataset))
 {
@@ -383,7 +1177,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<float>, framework
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
 }
 TEST_SUITE_END() // BFLOAT16
-#endif           /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
+#endif           /* defined(ARM_COMPUTE_ENABLE_BF16) */
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -409,14 +1203,62 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<float>, framework
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEGEMMConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                           framework::dataset::make("Input", TensorShape(23U, 27U, 5U)),
+                                                                                           framework::dataset::make("Weights", TensorShape(3U, 3U, 5U, 2U))),
+                                                                                       framework::dataset::make("Bias", TensorShape(2U))),
+                                                                               framework::dataset::make("Output", TensorShape(11U, 25U, 2U))),
+                                                                       framework::dataset::make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0))),
+                                                               framework::dataset::make("Dilation", Size2D(1, 1))),
+                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                               framework::dataset::make("DataType", DataType::F32)),
+                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                               ActivationFunctionsDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+/** Padded weights
+ * CpuGemmConv2d uses two different paths for reshaping the weights depending on whether the weight tensor has holes (a common
+ * way to have "holes" in a tensor is via extended padding)
+ *
+ * We only need to test the padded weight path here on a single floating data type and a single layout, because the fallback path is agnostic of them
+ */
+FIXTURE_DATA_TEST_CASE(RunPaddedWeights, NEGEMMConvolutionLayerPaddedWeightsFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                    framework::dataset::make("ReshapeWeights", { true }),
+                                                                                                                    framework::dataset::make("DataType", DataType::F32),
+                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NHWC })
+                                                                                                            ))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+
+// This very large shape test is required to test heuristic paths where the tensor size is > 1e7 bytes
+// and the weight dimensions are larger than 7
+FIXTURE_DATA_TEST_CASE(RunVeryLarge, NEGEMMConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::VeryLargeConvolutionLayerDataset(),
+        framework::dataset::make("ReshapeWeights", { true }),
+        framework::dataset::make("DataType", DataType::F32),
+        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+// TODO: COMPMID-6596 Extend quantized tests with at least one suite where the weight is padded (the legacy case, see floating point's RunPaddedWeights)
+template <typename T>
+using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEConvolutionLayer, T>;
 template <typename T>
-using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T>;
+using NEGEMMConvolutionLayerQuantizedMixedDataLayoutFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEConvolutionLayer, T, true>;
 
 template <typename T>
-using NEGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T, int8_t>;
+using NEGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEConvolutionLayer, T, int8_t>;
 
 const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
 {
@@ -425,17 +1267,39 @@ const auto QuantizedActivationFunctionsDataset = framework::dataset::make("Activ
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
 });
 TEST_SUITE(Quantized)
+/// @note: Every asymmetric quantized test without a fused activation will have its quantization info ignored.
+/// This is because, instead of using the same quantization information for all the tensors, the fixture generates
+/// separate quantization info for each input and the output tensor.
+/// Once dynamic quantization is also supported in the presence of an activation, these two versions should be merged
+/// again, with the explicitly specified quantization info removed.
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                                                                                                                        framework::dataset::make("ReshapeWeights", { true })),
                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                                                                                                       framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(2.f / 255.f, 10) })),
                                                                                                                        QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEGEMMConvolutionLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                   framework::dataset::make("Input", TensorShape(23U, 27U, 5U)),
+                                                                                                   framework::dataset::make("Weights", TensorShape(3U, 3U, 5U, 2U))),
+                                                                                               framework::dataset::make("Bias", TensorShape(2U))),
+                                                                                       framework::dataset::make("Output", TensorShape(11U, 25U, 2U))),
+                                                                               framework::dataset::make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0))),
+                                                                       framework::dataset::make("Dilation", Size2D(1, 1))),
+                                                               framework::dataset::make("ReshapeWeights", { true })),
+                                                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                       framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(2.f / 255.f, 10) })),
+                               QuantizedActivationFunctionsDataset))
+{
+    // Validate output; this case runs on the mixed-data-layout fixture alias (last template argument `true`), as its name implies
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
@@ -443,12 +1307,29 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedFixture<int8_t>,
                                                                                                                       framework::dataset::make("ReshapeWeights", { true })),
                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.01f, -10) })),
+                                                                                                                      framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(0.01f, -10) })),
                                                                                                                       QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEGEMMConvolutionLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                                                   framework::dataset::make("Input", TensorShape(23U, 27U, 5U)),
+                                                                                                   framework::dataset::make("Weights", TensorShape(3U, 3U, 5U, 2U))),
+                                                                                               framework::dataset::make("Bias", TensorShape(2U))),
+                                                                                       framework::dataset::make("Output", TensorShape(11U, 25U, 2U))),
+                                                                               framework::dataset::make("PadStrideInfo", PadStrideInfo(2, 1, 0, 0))),
+                                                                       framework::dataset::make("Dilation", Size2D(1, 1))),
+                                                               framework::dataset::make("ReshapeWeights", { true })),
+                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                       framework::dataset::make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(2.f / 255.f, 10) })),
+                               QuantizedActivationFunctionsDataset))
+{
+    // Validate output; this case runs on the mixed-data-layout fixture alias (last template argument `true`), as its name implies
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 TEST_SUITE(QSYMM8_PER_CHANNEL)
@@ -464,11 +1345,215 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedPerChannelFixtur
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallSigned, NEGEMMConvolutionLayerQuantizedPerChannelFixture<int8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                                               framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })),
+                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                               QuantizationData),
+                                       QuantizedActivationFunctionsDataset),
+                               framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output: QASYMM8_SIGNED data with QSYMM8_PER_CHANNEL weights, compared against the reference within tolerance_qasymm8
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(MemoryStressLargeChannels, NEGEMMConvolutionLayerQuantizedPerChannelFixture<int8_t>,
+    framework::DatasetMode::ALL,
+        combine(
+            make("In", TensorShape(1U)),
+            make("Weights", TensorShape(1U, 1U, 1U, 17000U)), // NOTE(review): presumably 17000 output channels, per the test name - confirm
+            make("Biases", TensorShape(17000U)),
+            make("Out", TensorShape(1U, 1U, 17000U)),
+            make("Info", PadStrideInfo(1, 1, 0, 0)),
+            make("Dilation", Size2D(1, 1)),
+            make("ReshapeWeights", { true }),
+            make("DataType", { DataType::QASYMM8_SIGNED }),
+            make("DataLayout", { DataLayout::NHWC }),
+            make("QuantizationInfo", QuantizationInfo(0.5f, 10)),
+            make("ActivationInfo", ActivationLayerInfo()),
+            make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output: single configuration (NHWC, default ActivationLayerInfo) compared against the reference within tolerance_qasymm8
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
 TEST_SUITE_END() // QSYMM8_PER_CHANNEL
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // GEMMConvolutionLayer
-TEST_SUITE_END() // NEON
+
+TEST_SUITE(DirectGEMMConv2d)
+template <typename T>
+using NEDirectGEMMConv2dLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEGEMMConv2d, T>;
+
+/** Test case for memory injection in @ref cpu::CpuGemmDirectConv2d.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output (exact element-wise comparison)
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto       conv        = std::make_unique<cpu::CpuGemmDirectConv2d>();
+    const auto src_info    = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NHWC);
+    const auto weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NHWC);
+    const auto bias_info   = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NHWC);
+    auto       dst_info    = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NHWC);
+    const auto conv_info   = Conv2dInfo{};
+    conv->configure(&src_info, &weight_info, &bias_info, &dst_info, conv_info);
+
+    // Input tensors are created and allocated once here and shared by every call of the lambda below
+    auto src    = create_tensor<Tensor>(src_info);
+    auto weight = create_tensor<Tensor>(weight_info);
+    auto bias   = create_tensor<Tensor>(bias_info);
+    src.allocator()->allocate();
+    weight.allocator()->allocate();
+    bias.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(conv->workspace(), mg, run_pack, prep_pack);
+
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info); // only the destination is newly created on every call
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        // This operator is configured once and captured by this lambda.
+        conv->prepare(prep_pack);
+        conv->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(reinterpret_cast<const float *>(result_0.buffer())[i] == reinterpret_cast<const float *>(result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Test case for memory injection in @ref NEGEMMConv2d.
+ *
+ * Make sure @ref NEGEMMConv2d still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output (exact element-wise comparison)
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto       conv        = std::make_unique<NEGEMMConv2d>();
+    const auto src_info    = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NHWC);
+    const auto weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NHWC);
+    const auto bias_info   = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NHWC);
+    auto       dst_info    = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NHWC);
+    const auto conv_info   = Conv2dInfo{};
+    auto       run_conv    = [&]()
+    {
+        auto src    = create_tensor<Tensor>(src_info); // all four tensors are newly created on every call
+        auto weight = create_tensor<Tensor>(weight_info);
+        auto bias   = create_tensor<Tensor>(bias_info);
+        auto dst    = create_tensor<Tensor>(dst_info);
+        conv->configure(&src, &weight, &bias, &dst, conv_info); // the same function object is re-configured on every call
+        src.allocator()->allocate();
+        weight.allocator()->allocate();
+        bias.allocator()->allocate();
+        dst.allocator()->allocate();
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        conv->run();
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(reinterpret_cast<const float *>(result_0.buffer())[i] == reinterpret_cast<const float *>(result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                     framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                     framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                             ActivationFunctionsDataset))
+{
+    // Validate output (NHWC only in this suite) against the reference using the FP32 relative/absolute tolerances
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+
+#ifdef __aarch64__
+template <typename T>
+using NEDirectGEMMConv2dLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEGEMMConv2d, T>;
+
+template <typename T>
+using NEDirectGEMMConv2dLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEGEMMConv2d, T, int8_t>;
+
+const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
+});
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                        framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                                                                                                        QuantizedActivationFunctionsDataset))
+{
+    // Validate output (QASYMM8, NHWC only) against the reference within tolerance_qasymm8
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.01f, -10) })),
+                                                                                                                       QuantizedActivationFunctionsDataset))
+{
+    // Validate output (QASYMM8_SIGNED, NHWC only) against the reference within tolerance_qasymm8
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE(QSYMM8_PER_CHANNEL)
+FIXTURE_DATA_TEST_CASE(RunSmallSigned, NEDirectGEMMConv2dLayerQuantizedPerChannelFixture<int8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                                               framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })),
+                                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                               QuantizationData),
+                                       QuantizedActivationFunctionsDataset),
+                               framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output: QASYMM8_SIGNED data with QSYMM8_PER_CHANNEL weights (NHWC only), compared within tolerance_qasymm8
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QSYMM8_PER_CHANNEL
+TEST_SUITE_END() // Quantized
+#endif           // __aarch64__
+
+TEST_SUITE_END() // DirectGEMMConv2d
+
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Copy.cpp b/tests/validation/NEON/Copy.cpp
index 350fd0085e..e2e5210e89 100644
--- a/tests/validation/NEON/Copy.cpp
+++ b/tests/validation/NEON/Copy.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -92,7 +92,7 @@ TEST_SUITE_END() // U16
 TEST_SUITE_END() // FixedSeed
 
 TEST_SUITE_END() // Copy
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/CropResize.cpp b/tests/validation/NEON/CropResize.cpp
index c6a1046b7e..df7166bfdc 100644
--- a/tests/validation/NEON/CropResize.cpp
+++ b/tests/validation/NEON/CropResize.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -187,7 +187,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
 TEST_SUITE_END() // S32
 
 TEST_SUITE_END() // CropResize
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/DeconvolutionLayer.cpp b/tests/validation/NEON/DeconvolutionLayer.cpp
index d888d7b838..b4c049f6f9 100644
--- a/tests/validation/NEON/DeconvolutionLayer.cpp
+++ b/tests/validation/NEON/DeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,54 +47,86 @@ constexpr AbsoluteTolerance<float> tolerance_fp32(0.001f);    /**< Tolerance for
 constexpr AbsoluteTolerance<float> tolerance_quantized(1.0f); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 const RelativeTolerance<half_float::half> tolerance_fp16(half_float::half(0.2f)); /**< Relative tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr float                           tolerance_num_fp16 = 0.02f;             /**< Tolerance number for FP16 tests -- follows a slightly stricter approach compared to ConvolutionLayer tests */
 #endif                                                                            /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
-constexpr float tolerance_num = 0.07f;                                            /**< Tolerance number */
+constexpr float tolerance_num_quant = 0.07f;                                      /**< Tolerance number for quantized types */
 
 const auto data4x4 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 3)
-                     * framework::dataset::make("PadY", 0, 3) * framework::dataset::make("NumKernels", { 3 });
+                     * framework::dataset::make("PadY", 0, 3) * framework::dataset::make("NumKernels",
+{
+    3
+});
 
 const auto data3x3 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 2)
-                     * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels", { 3 });
+                     * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels",
+{
+    3
+});
 
 const auto data3x3_asymm = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 2) * framework::dataset::make("StrideY", 1, 2) * framework::dataset::make("PadLeft", 0, 1)
-                           * framework::dataset::make("PadRight", 0, 1) * framework::dataset::make("PadTop", 0, 1) * framework::dataset::make("PadBottom", 0, 1) * framework::dataset::make("NumKernels", { 3 });
+                           * framework::dataset::make("PadRight", 0, 1) * framework::dataset::make("PadTop", 0, 1) * framework::dataset::make("PadBottom", 0, 1) * framework::dataset::make("NumKernels",
+{
+    3
+});
 
-const auto data9x9_small_asymm = framework::dataset::make("InputShape", TensorShape{ 10U, 10U, 1U, 1U }) *framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY",
-                                 2)
-                                 *framework::dataset::make("PadLeft", 3)
-                                 *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop", 3) *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 });
+const auto data9x9_small_asymm = framework::dataset::make("InputShape", TensorShape
+{
+    10U, 10U, 1U, 1U
+})
+*framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY", 2) *framework::dataset::make("PadLeft", 3) *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop",
+        3)  *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 });
 
-const auto data9x9_large_asymm = framework::dataset::make("InputShape", TensorShape{ 640U, 360U, 56U, 1U }) *framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY",
-                                 2)
-                                 *framework::dataset::make("PadLeft", 3)
-                                 *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop", 3) *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 });
+const auto data9x9_large_asymm = framework::dataset::make("InputShape", TensorShape
+{
+    640U, 360U, 56U, 1U
+})
+*framework::dataset::make("StrideX", 2) *framework::dataset::make("StrideY", 2) *framework::dataset::make("PadLeft", 3) *framework::dataset::make("PadRight", 4) *framework::dataset::make("PadTop",
+        3)  *framework::dataset::make("PadBottom", 4) *framework::dataset::make("NumKernels", { 1 });
 
 const auto data3x3_precommit = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 2) * framework::dataset::make("StrideY", 1, 2) * framework::dataset::make("PadX", 0, 2)
-                               * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels", { 3 });
+                               * framework::dataset::make("PadY", 0, 2) * framework::dataset::make("NumKernels",
+{
+    3
+});
 
 const auto data1x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 1)
-                     * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels", { 3 });
+                     * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels",
+{
+    3
+});
+
+const auto data5x1 = datasets::SmallDeconvolutionShapes() * framework::dataset::make("StrideX", 1, 4) * framework::dataset::make("StrideY", 1, 4) * framework::dataset::make("PadX", 0, 1)
+                     * framework::dataset::make("PadY", 0, 1) * framework::dataset::make("NumKernels",
+{
+    3
+});
 
-const auto data_layouts_dataset = framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC });
+const auto data_layouts_dataset = framework::dataset::make("DataLayout",
+{
+    DataLayout::NCHW, DataLayout::NHWC
+});
 
-const auto add_bias_dataset = framework::dataset::make("AddBias", { true, false });
+const auto add_bias_dataset = framework::dataset::make("AddBias",
+{
+    true, false
+});
 
 const auto input_qinfo_dataset = framework::dataset::make("InputQInfo",
 {
     QuantizationInfo(1.f / 255.f, 0),
-    QuantizationInfo(2.f, 0),
+                     QuantizationInfo(2.f, 0),
 });
 
 const auto output_qinfo_dataset = framework::dataset::make("OutputQInfo",
 {
     QuantizationInfo(3.f / 255.f, 0),
-    QuantizationInfo(4.f, 0),
+                     QuantizationInfo(4.f, 0),
 });
+
 } // namespace
 
 TEST_SUITE(NEON)
 TEST_SUITE(DeconvolutionLayer)
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
@@ -104,6 +136,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),  // Invalid bias shape
                                             TensorInfo(TensorShape(13U, 11U, 4U, 3U), 1, DataType::F32), // Window shrink
                                             TensorInfo(TensorShape(32U, 16U, 2U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(2U,2U,1U,1U), 1, DataType::F32),    // Small shape no padding
+                                            TensorInfo(TensorShape(3U,26U,26U,1U), 1, DataType::F32),    // Negative padding
                                           }),
     framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16),
                                             TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32),
@@ -111,6 +145,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                             TensorInfo(TensorShape(3U, 2U, 2U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(3U, 3U, 4U), 1, DataType::F32),
                                               TensorInfo(TensorShape(1U, 1U, 2U, 4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(3U,3U,1U,1U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(1U,1U,26U,88U), 1, DataType::F32),
                                           })),
     framework::dataset::make("BiasInfo",  { TensorInfo(TensorShape(1U), 1, DataType::F16),
                                             TensorInfo(TensorShape(1U), 1, DataType::F32),
@@ -118,6 +154,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                             TensorInfo(TensorShape(25U, 11U), 1, DataType::F32),
                                             TensorInfo(TensorShape(1U), 1, DataType::F32),
                                             TensorInfo(TensorShape(4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(1U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(88U), 1, DataType::F32),
                                           })),
     framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F16),
                                             TensorInfo(TensorShape(25U, 10U, 2U), 1, DataType::F32),
@@ -125,6 +163,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                             TensorInfo(TensorShape(13U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(11U, 9U, 1U, 3U), 1, DataType::F32),
                                             TensorInfo(TensorShape(32U, 16U, 4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(4U,4U,1U,1U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(1U,78U,88U,1U), 1, DataType::F32),
                                           })),
     framework::dataset::make("PadStrideInfo", { PadStrideInfo(1, 1, 0, 0),
                                                 PadStrideInfo(1, 1, 0, 0),
@@ -132,8 +172,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                                 PadStrideInfo(1, 1, 0, 0),
                                                 PadStrideInfo(1, 1, 1, 1),
                                                 PadStrideInfo(1, 1, 0, 0),
+                                                PadStrideInfo(1, 1, 0, 0),
+                                                PadStrideInfo(2, 3, 3, 1),
                                            })),
-    framework::dataset::make("Expected", { false, false, false, false, false, true })),
+    framework::dataset::make("Expected", { false, false, false, false, false, true,true, false })),
     input_info, weights_info, bias_info, output_info, pad_info, expected)
 {
     bool is_valid = bool(NEDeconvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pad_info));
@@ -157,6 +199,9 @@ using NEDeconvolutionLayerAsymmFixture9x9 = DeconvolutionValidationAsymmFixture<
 template <typename T>
 using NEDeconvolutionLayerFixture1x1 = DeconvolutionValidationFixture<Tensor, Accessor, NEDeconvolutionLayer, T, 1, 1>;
 
+template <typename T>
+using NEDeconvolutionLayerFixture5x1 = DeconvolutionValidationFixture<Tensor, Accessor, NEDeconvolutionLayer, T, 5, 1>;
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 TEST_SUITE(W4x4)
@@ -220,6 +265,15 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerAsymmFixture9x9<float>, fra
     validate(Accessor(_target), _reference, tolerance_fp32);
 }
 TEST_SUITE_END() // W9x9
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture5x1<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data5x1, framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                    data_layouts_dataset),
+                                                                                                            add_bias_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+TEST_SUITE_END() // W5x1
 TEST_SUITE_END() // FP32
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -230,7 +284,7 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture4x4<half>, framework::Dat
                                                                                                            add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_fp16);
+    validate(Accessor(_target), _reference, tolerance_fp16, tolerance_num_fp16);
 }
 TEST_SUITE_END() // W4x4
 TEST_SUITE(W3x3)
@@ -240,14 +294,14 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDeconvolutionLayerFixture3x3<half>, framework
                                                                                                                   add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_fp16);
+    validate(Accessor(_target), _reference, tolerance_fp16, tolerance_num_fp16);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data3x3, framework::dataset::make("DataType", DataType::F16)),
                                                                                                                         data_layouts_dataset),
                                                                                                                 add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_fp16);
+    validate(Accessor(_target), _reference, tolerance_fp16, tolerance_num_fp16);
 }
 TEST_SUITE_END() // W3x3
 TEST_SUITE(W1x1)
@@ -256,9 +310,18 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture1x1<half>, framework::Dat
                                                                                                            add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_fp16);
+    validate(Accessor(_target), _reference, tolerance_fp16, tolerance_num_fp16);
 }
 TEST_SUITE_END() // W1x1
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerFixture5x1<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data5x1, framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                   data_layouts_dataset),
+                                                                                                           add_bias_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp16, tolerance_num_fp16);
+}
+TEST_SUITE_END() // W5x1
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
@@ -273,6 +336,21 @@ using NEDeconvolutionLayerQuantizedFixture3x3 = DeconvolutionValidationQuantized
 template <typename T>
 using NEDeconvolutionLayerQuantizedFixture1x1 = DeconvolutionValidationQuantizedFixture<Tensor, Accessor, NEDeconvolutionLayer, T, 1, 1>;
 
+template <typename T>
+using NEDeconvolutionLayerQuantizedFixture5x1 = DeconvolutionValidationQuantizedFixture<Tensor, Accessor, NEDeconvolutionLayer, T, 5, 1>;
+
+template <typename T>
+using NEDeconvolutionLayerQuantizedPerChannelFixture4x4 = DeconvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEDeconvolutionLayer, T, int8_t, 4, 4>;
+
+template <typename T>
+using NEDeconvolutionLayerQuantizedPerChannelFixture3x3 = DeconvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEDeconvolutionLayer, T, int8_t, 3, 3>;
+
+template <typename T>
+using NEDeconvolutionLayerQuantizedPerChannelFixture1x1 = DeconvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEDeconvolutionLayer, T, int8_t, 1, 1>;
+
+template <typename T>
+using NEDeconvolutionLayerQuantizedPerChannelFixture5x1 = DeconvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEDeconvolutionLayer, T, int8_t, 5, 1>;
+
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 
@@ -285,7 +363,7 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture4x4<uint8_t>, fr
                                                                                                                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W4x4
 
@@ -299,7 +377,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDeconvolutionLayerQuantizedFixture3x3<uint8_t
                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data3x3,
                        framework::dataset::make("DataType",
@@ -310,7 +388,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerQuantizedFixture3x3<uint8_t
                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W3x3
 
@@ -323,10 +401,23 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture1x1<uint8_t>, fr
                                                                                                                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W1x1
 
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture5x1<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data5x1, framework::dataset::make("DataType",
+                                                                                                                       DataType::QASYMM8)),
+                                                                                                                       data_layouts_dataset),
+                                                                                                                       input_qinfo_dataset),
+                                                                                                                       output_qinfo_dataset),
+                                                                                                                       add_bias_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W5x1
+
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
@@ -340,7 +431,7 @@ FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture4x4<int8_t>, fra
                                                                                                                       add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W4x4
 
@@ -354,7 +445,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDeconvolutionLayerQuantizedFixture3x3<int8_t>
                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerQuantizedFixture3x3<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data3x3,
                        framework::dataset::make("DataType",
@@ -365,28 +456,164 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDeconvolutionLayerQuantizedFixture3x3<int8_t>
                        add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W3x3
 
 TEST_SUITE(W1x1)
-FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture1x1<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data1x1, framework::dataset::make("DataType",
-                                                                                                                      DataType::QASYMM8_SIGNED)),
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture1x1<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data1x1,
+                                                                                                                      framework::dataset::make("DataType",
+                                                                                                                              DataType::QASYMM8_SIGNED)),
                                                                                                                       data_layouts_dataset),
                                                                                                                       input_qinfo_dataset),
                                                                                                                       output_qinfo_dataset),
                                                                                                                       add_bias_dataset))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num);
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
 }
 TEST_SUITE_END() // W1x1
 
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedFixture5x1<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(data5x1, framework::dataset::make("DataType",
+                                                                                                                      DataType::QASYMM8_SIGNED)),
+                                                                                                                      data_layouts_dataset),
+                                                                                                                      input_qinfo_dataset),
+                                                                                                                      output_qinfo_dataset),
+                                                                                                                      add_bias_dataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W5x1
+
 TEST_SUITE_END() // QASYMM8_SIGNED
+
+const auto input_qinfo_per_channel_dataset = framework::dataset::make("InputQuantizationInfo",
+{
+    QuantizationInfo(1.f / 255.f, 10)
+});
+const auto output_qinfo_per_channel_dataset = framework::dataset::make("OutputQuantizationInfo",
+{
+    QuantizationInfo(3.f / 255.f, 0)
+});
+const auto input_signed_qinfo_per_channel_dataset = framework::dataset::make("InputQuantizationInfo",
+{
+    QuantizationInfo(1.f / 255.f, -10)
+});
+const auto output_signed_qinfo_per_channel_dataset = framework::dataset::make("OutputQuantizationInfo",
+{
+    QuantizationInfo(3.f / 255.f, 10)
+});
+
+TEST_SUITE(QSYMM8_PER_CHANNEL)
+
+TEST_SUITE(W4x4)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedPerChannelFixture4x4<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data4x4,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_per_channel_dataset),
+                       output_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunSigned, NEDeconvolutionLayerQuantizedPerChannelFixture4x4<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data4x4,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_per_channel_dataset),
+                       output_signed_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W4x4
+
+TEST_SUITE(W3x3)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedPerChannelFixture3x3<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data3x3,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_per_channel_dataset),
+                       output_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunSigned, NEDeconvolutionLayerQuantizedPerChannelFixture3x3<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data3x3,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_per_channel_dataset),
+                       output_signed_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W3x3
+
+TEST_SUITE(W1x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedPerChannelFixture1x1<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data1x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_per_channel_dataset),
+                       output_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunSigned, NEDeconvolutionLayerQuantizedPerChannelFixture1x1<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data1x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_per_channel_dataset),
+                       output_signed_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W1x1
+
+TEST_SUITE(W5x1)
+FIXTURE_DATA_TEST_CASE(Run, NEDeconvolutionLayerQuantizedPerChannelFixture5x1<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data5x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       data_layouts_dataset),
+                       input_qinfo_per_channel_dataset),
+                       output_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+FIXTURE_DATA_TEST_CASE(RunSigned, NEDeconvolutionLayerQuantizedPerChannelFixture5x1<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(data5x1,
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       data_layouts_dataset),
+                       input_signed_qinfo_per_channel_dataset),
+                       output_signed_qinfo_per_channel_dataset),
+                       add_bias_dataset),
+                       framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_quantized, tolerance_num_quant);
+}
+TEST_SUITE_END() // W5x1
+
+TEST_SUITE_END() // QSYMM8_PER_CHANNEL
+
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // DeconvolutionLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/DepthConcatenateLayer.cpp b/tests/validation/NEON/DepthConcatenateLayer.cpp
index eea7c4d82f..1c69d44a2b 100644
--- a/tests/validation/NEON/DepthConcatenateLayer.cpp
+++ b/tests/validation/NEON/DepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,7 +67,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
     inputs_vector_info.emplace_back(std::move(input_info1));
     inputs_vector_info.emplace_back(std::move(input_info2));
 
-    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    std::vector<const ITensorInfo *> inputs_vector_info_raw;
     inputs_vector_info_raw.reserve(inputs_vector_info.size());
     for(auto &input : inputs_vector_info)
     {
diff --git a/tests/validation/NEON/DepthConvertLayer.cpp b/tests/validation/NEON/DepthConvertLayer.cpp
index 7af467be28..4972708144 100644
--- a/tests/validation/NEON/DepthConvertLayer.cpp
+++ b/tests/validation/NEON/DepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,25 +56,21 @@ const auto DepthConvertLayerU16toU8Dataset      = combine(framework::dataset::ma
 const auto DepthConvertLayerU16toU32Dataset     = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U32));
 const auto DepthConvertLayerS16toU8Dataset      = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U8));
 const auto DepthConvertLayerS16toS32Dataset     = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::S32));
-const auto DepthConvertLayerBF16toF32Dataset    = combine(framework::dataset::make("DataType", DataType::BFLOAT16), framework::dataset::make("DataType", DataType::F32));
 const auto DepthConvertLayerF16toU8Dataset      = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::U8));
 const auto DepthConvertLayerF16toF32Dataset     = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F32));
 const auto DepthConvertLayerF16toS32Dataset     = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::S32));
 const auto DepthConvertLayerF32toF16Dataset     = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F16));
 const auto DepthConvertLayerF32toS32Dataset     = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S32));
 const auto DepthConvertLayerF32toU8Dataset      = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U8));
-const auto DepthConvertLayerF32toBF16Dataset    = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::BFLOAT16));
 
 const auto DepthConvertLayerS32toF32Dataset     = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F32));
 const auto DepthConvertLayerS32toQASYMM8Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::QASYMM8));
 const auto DepthConvertLayerS32toF16Dataset     = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F16));
 const auto DepthConvertLayerS32toU8Dataset      = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::U8));
 
-const auto DepthConvertLayerF16toQASYMM8Dataset   = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::QASYMM8));
-const auto DepthConvertLayerF32toQASYMM8Dataset   = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8));
-const auto DepthConvertLayerShiftDatasetNightly   = framework::dataset::make("Shift", 0, 7);
-const auto DepthConvertLayerShiftDatasetPrecommit = framework::dataset::make("Shift", { 0, 3, 6 });
-const auto DepthConvertLayerZeroShiftDataset      = framework::dataset::make("Shift", 0);
+const auto DepthConvertLayerF16toQASYMM8Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::QASYMM8));
+const auto DepthConvertLayerF32toQASYMM8Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8));
+const auto DepthConvertLayerZeroShiftDataset    = framework::dataset::make("Shift", 0);
 
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
 constexpr AbsoluteTolerance<int32_t> tolerance_one_int32(1);
@@ -108,7 +104,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                                                    ConvertPolicy::WRAP,
                                                    ConvertPolicy::WRAP,
                                                      })),
-               framework::dataset::make("Shift",{ 1, 1, 1, 1, 1, 1, 8, 1,
+               framework::dataset::make("Shift",{ 0, 0, 0, 1, 1, 1, 8, 1,
                                                      })),
                framework::dataset::make("Expected", { false, false, false, false, true})),
                input_info, output_info, policy, shift, expected)
@@ -129,8 +125,6 @@ using NEDepthConvertLayerToU8Fixture = DepthConvertLayerValidationFixture<Tensor
 template <typename T>
 using NEDepthConvertLayerToU32Fixture = DepthConvertLayerValidationFixture<Tensor, Accessor, NEDepthConvertLayer, T, uint32_t>;
 template <typename T>
-using NEDepthConvertLayerToBF16Fixture = DepthConvertLayerValidationFixture<Tensor, Accessor, NEDepthConvertLayer, T, bfloat16>;
-template <typename T>
 using NEDepthConvertLayerToF16Fixture = DepthConvertLayerValidationFixture<Tensor, Accessor, NEDepthConvertLayer, T, half>;
 template <typename T>
 using NEDepthConvertLayerToF32Fixture = DepthConvertLayerValidationFixture<Tensor, Accessor, NEDepthConvertLayer, T, float>;
@@ -188,7 +182,7 @@ TEST_SUITE_END() // QASYMM8_to_S32
 TEST_SUITE(U8_to_U16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToU16Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toU16Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -196,7 +190,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToU16Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToU16Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toU16Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -206,7 +200,7 @@ TEST_SUITE_END() // U8_to_U16
 TEST_SUITE(U8_to_S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToS16Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toS16Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -214,7 +208,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToS16Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToS16Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toS16Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -223,7 +217,7 @@ TEST_SUITE_END() // U8_to_S16
 TEST_SUITE(U8_to_S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToS32Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toS32Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -231,7 +225,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToS32Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToS32Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toS32Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -241,7 +235,7 @@ TEST_SUITE_END() // U8_to_S32
 TEST_SUITE(U8_to_F32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToF32Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toF32Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -249,7 +243,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToF32Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToF32Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toF32Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -260,7 +254,7 @@ TEST_SUITE_END() // U8_to_F32
 TEST_SUITE(U8_to_F16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToF16Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toF16Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -268,7 +262,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToF16Fixture<uint8_t>, frame
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToF16Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU8toF16Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -279,14 +273,14 @@ TEST_SUITE_END() // U8_to_F36
 TEST_SUITE(U16_to_U8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToU8Fixture<uint16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU16toU8Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToU8Fixture<uint16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU16toU8Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -296,14 +290,14 @@ TEST_SUITE_END() // U16_to_U8
 TEST_SUITE(U16_to_U32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToU32Fixture<uint16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU16toU32Dataset),
                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                       DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                       DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToU32Fixture<uint16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerU16toU32Dataset),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                     DepthConvertLayerShiftDatasetNightly))
+                                                                                                                     DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -313,14 +307,14 @@ TEST_SUITE_END() // U16_to_U32
 TEST_SUITE(S16_to_U8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToU8Fixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerS16toU8Dataset),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                     DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                     DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToU8Fixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerS16toU8Dataset),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                   DepthConvertLayerShiftDatasetNightly))
+                                                                                                                   DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -330,42 +324,20 @@ TEST_SUITE_END() // S16_to_U8
 TEST_SUITE(S16_to_S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToS32Fixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerS16toS32Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
+                                                                                                                      DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToS32Fixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), DepthConvertLayerS16toS32Dataset),
                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                    DepthConvertLayerShiftDatasetNightly))
+                                                                                                                    DepthConvertLayerZeroShiftDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // S16_to_S32
 
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
-TEST_SUITE(BFLOAT16_to_F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToF32Fixture<bfloat16>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerBF16toF32Dataset),
-                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                       DepthConvertLayerZeroShiftDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // BFLOAT16_to_F32
-
-TEST_SUITE(F32_to_BFLOAT16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToBF16Fixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerF32toBF16Dataset),
-                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                     DepthConvertLayerZeroShiftDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // F32_to_BFLOAT16
-#endif           /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
-
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16_to_QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConvertLayerToQASYMM8Fixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
@@ -608,7 +580,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConvertLayerToU8Fixture<int32_t>, framew
 TEST_SUITE_END() // S32_to_U8
 
 TEST_SUITE_END() // DepthConvertLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/DepthToSpaceLayer.cpp b/tests/validation/NEON/DepthToSpaceLayer.cpp
index abc8c4f266..46ac59e95c 100644
--- a/tests/validation/NEON/DepthToSpaceLayer.cpp
+++ b/tests/validation/NEON/DepthToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -109,7 +109,7 @@ TEST_SUITE_END()
 TEST_SUITE_END()
 
 TEST_SUITE_END() // DepthToSpace
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index e4a136ea7b..e9609b7b72 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,32 +42,70 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
 using namespace arm_compute::misc::shape_calculator;
 
 namespace
 {
-constexpr RelativeTolerance<float>   tolerance_f32(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
+constexpr RelativeTolerance<float>   tolerance_f32(0.01f);        /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);        /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1); /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8_SIGNED */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.02)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 constexpr float                     tolerance_num = 0.05f;                 /**< Tolerance number */
 #endif                                                                     // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
-const auto depth_multipliers       = framework::dataset::make("DepthMultiplier", { 1, 2, 5 });
-const auto large_depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 5, 8 });
+const auto depth_multipliers       = make("DepthMultiplier", { 1, 2, 8 });
+const auto large_depth_multipliers = make("DepthMultiplier", { 5, 32 });
 
-//Activation Functions
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+// Activation Functions
+const auto NoActivation = make("ActivationInfo", ActivationLayerInfo());
+
+const auto ActivationFunctionsDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
 });
 
-const auto input_qinfo_dataset = framework::dataset::make("InputQInfo",
+const auto ActivationFunctionsDatasetNightly = make("ActivationInfo",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f, -0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SQUARE),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SWISH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::HARD_SWISH),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 2.f, 1.f),
+#ifdef __aarch64__
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::GELU),
+#endif // __aarch64__
+});
+
+const auto ActivationFunctionsQuantizedSmallDataset = make("ActivationInfo",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+});
+
+const auto ActivationFunctionsQuantizedDataset = make("ActivationInfo",
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f, -0.5f),
+});
+
+// This is only used when there is fused activation
+const auto input_qinfo_dataset = make("InputQInfo",
 {
     QuantizationInfo(0.3f, 10),
     QuantizationInfo(2.2f, 10),
 });
+
+const auto IgnoredQuantizationInfo = make("IgnoredQuantizationInfo", QuantizationInfo());
+
 } // namespace
 
 TEST_SUITE(NEON)
@@ -76,7 +114,7 @@ TEST_SUITE(DepthwiseConvolutionLayer)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Mismatching data type input/weights
+               make("InputInfo", { TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Mismatching data type input/weights
                                                        TensorInfo(TensorShape(32U, 18U, 3U), 1, DataType::F32),     // Mismatching input feature maps
                                                        TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Unsupported weights dimensions
                                                        TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Mismatching depth multiplier
@@ -88,7 +126,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // dilation < 1
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                                      }),
-               framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16),
+               make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16),
                                                          TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(5U, 5U, 2U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
@@ -100,7 +138,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                          TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F32),
                                                        })),
-               framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
+               make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
@@ -112,7 +150,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                       })),
-               framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
+               make("OutputInfo", { TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
@@ -124,7 +162,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                       })),
-               framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+               make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
                                                       PadStrideInfo(1, 1, 0, 0),
                                                       PadStrideInfo(1, 1, 0, 0),
                                                       PadStrideInfo(1, 1, 0, 0),
@@ -136,7 +174,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                       PadStrideInfo(1, 1, 0, 0),
                                                       PadStrideInfo(1, 1, 0, 0),
                                                      })),
-               framework::dataset::make("DepthMultiplier", { 1,
+               make("DepthMultiplier", { 1,
                                                              1,
                                                              1,
                                                              3,
@@ -148,7 +186,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                              1,
                                                              1,
                                                             })),
-               framework::dataset::make("Dilation", { Size2D(1U, 1U),
+               make("Dilation", { Size2D(1U, 1U),
                                                       Size2D(1U, 1U),
                                                       Size2D(1U, 1U),
                                                       Size2D(1U, 1U),
@@ -160,7 +198,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
                                                       Size2D(0U, 1U),
                                                       Size2D(1U, 1U),
                                                             })),
-               framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false, false, true })),
+               make("Expected", { false, false, false, false, false, false, false, false, false, false, true })),
                input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier,dilation, expected)
 {
     bool is_valid = bool(NEDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false),
@@ -169,7 +207,7 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip
 }
 
 DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
-                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Mismatching data type input/weights
+                make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Mismatching data type input/weights
                                                         TensorInfo(TensorShape(27U, 13U, 3U), 1, DataType::F32),     // Mismatching input feature maps
                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Mismatching depth multiplier
                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Invalid biases size
@@ -177,10 +215,8 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Invalid output size
                                                         TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),     // Patch size bigger than input width
                                                         TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),     // Dilation < 1
-                                                        TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),     // Window shrinking
-                                                        TensorInfo(TensorShape(32U, 13U, 8U), 1, DataType::QASYMM8), // Window shrinking
                                                       }),
-                framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16),
+                make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16),
                                                           TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
                                                           TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
                                                           TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
@@ -188,10 +224,8 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                           TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
                                                           TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
                                                           TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
-                                                          TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
-                                                          TensorInfo(TensorShape(3U, 3U, 24U), 1, DataType::QASYMM8),
                                                         })),
-                framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
+                make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(4U), 1, DataType::F32),
@@ -199,10 +233,8 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                          TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(16U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(16U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(16U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(24U), 1, DataType::S32),
                                                        })),
-                framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
@@ -210,11 +242,8 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                          TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
                                                          TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
-                                                         TensorInfo(TensorShape(32U, 11U, 24U), 1, DataType::QASYMM8),
                                                        })),
-                framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
-                                                       PadStrideInfo(1, 1, 0, 0),
+                make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
@@ -222,9 +251,8 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
-                                                       PadStrideInfo(1, 1, 1, 0),
                                                       })),
-                framework::dataset::make("DepthMultiplier", { 1,
+                make("DepthMultiplier", { 1,
                                                               1,
                                                               3,
                                                               1,
@@ -232,10 +260,8 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                               1,
                                                               2,
                                                               2,
-                                                              2,
-                                                              3,
                                                              })),
-                framework::dataset::make("Dilation", { Size2D(1U, 1U),
+                make("Dilation", { Size2D(1U, 1U),
                                                        Size2D(1U, 1U),
                                                        Size2D(1U, 1U),
                                                        Size2D(1U, 1U),
@@ -243,10 +269,8 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
                                                        Size2D(1U, 1U),
                                                        Size2D(25U, 1U),
                                                        Size2D(0U, 1U),
-                                                       Size2D(1U, 1U),
-                                                       Size2D(1U, 1U),
                                                              })),
-                framework::dataset::make("Expected", { false, false, false, false, false, false,false, false, false, false })),
+                make("Expected", { false, false, false, false, false, false, false, false})),
                 input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier,dilation, expected)
 {
     bool is_valid = bool(NEDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier, ActivationLayerInfo(), dilation));
@@ -256,45 +280,71 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip
 // *INDENT-ON*
 template <typename T>
 using NEDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
+template <typename T>
+using NEDepthwiseConvolutionLayerMixedDataLayoutFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T, true>;
+template <typename T>
+using NEDepthwiseConvolutionLayerVariableWeightsFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T, false, false, true>;
 
 TEST_SUITE(Float)
 TEST_SUITE(F32)
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 5 }),
+        make("DataType", DataType::F32),
+        make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }),
+        ActivationFunctionsDatasetNightly))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
 TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                                                                                                                       depth_multipliers),
-                                                                                                                       framework::dataset::make("DataType",
-                                                                                                                               DataType::F32)),
-                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                           depth_multipliers),
+                           make("DataType", DataType::F32)),
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                           ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
-                                                                                                                     large_depth_multipliers),
-                                                                                                                     framework::dataset::make("DataType",
-                                                                                                                             DataType::F32)),
-                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                     ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, NEDepthwiseConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                           make("DepthMultiplier", { 2 })),
+                           make("DataType", DataType::F32)),
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                           make("ActivationInfo", ActivationLayerInfo())))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                           large_depth_multipliers),
+                           make("DataType", DataType::F32)),
+                           make("DataLayout", { DataLayout::NHWC })),
+                           make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                                                                                                                       depth_multipliers),
-                                                                                                                       framework::dataset::make("DataType",
-                                                                                                                               DataType::F32)),
-                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+                                                           depth_multipliers),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
-                                                                                                                     large_depth_multipliers),
-                                                                                                                     framework::dataset::make("DataType",
-                                                                                                                             DataType::F32)),
-                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                     ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+                                                           large_depth_multipliers),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
@@ -302,43 +352,43 @@ TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
 
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                                                                                                                       depth_multipliers),
-                                                                                                                       framework::dataset::make("DataType",
-                                                                                                                               DataType::F32)),
-                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                           depth_multipliers),
+                           make("DataType",
+                                                    DataType::F32)),
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                           ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                                                       large_depth_multipliers),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F32)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                                           large_depth_multipliers),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                       depth_multipliers),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F32)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                                           depth_multipliers),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                       large_depth_multipliers),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F32)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                                           large_depth_multipliers),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
@@ -347,33 +397,72 @@ TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // W3x3
 
 TEST_SUITE(Optimized)
-FIXTURE_DATA_TEST_CASE(RunSmall3x3, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F32)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunVariableWeightsSmall3x3, NEDepthwiseConvolutionLayerVariableWeightsFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout3x3, NEDepthwiseConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType", DataType::F32)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   make("ActivationInfo", ActivationLayerInfo())))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunVariableWeightsSmall5x5, NEDepthwiseConvolutionLayerVariableWeightsFixture<float>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall5x5, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
-                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F32)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge3x3, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F32)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunVariableWeightsLarge3x3, NEDepthwiseConvolutionLayerVariableWeightsFixture<float>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                           make("DepthMultiplier", 1)),
+                                                   make("DataType",
+                                                                            DataType::F32)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
@@ -382,42 +471,57 @@ TEST_SUITE_END() // F32
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 5 }),
+        make("DataType", DataType::F16),
+        make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }),
+        ActivationFunctionsDatasetNightly))
+{
+    validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+
 TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                                                                                                                      depth_multipliers),
-                                                                                                                      framework::dataset::make("DataType",
-                                                                                                                              DataType::F16)),
-                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                      ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                           depth_multipliers),
+                           make("DataType",
+                                                    DataType::F16)),
+                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                           ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
-                                                                                                                    large_depth_multipliers),
-                                                                                                                    framework::dataset::make("DataType",
-                                                                                                                            DataType::F16)),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                    ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                                                                                                                        large_depth_multipliers),
+                                                                                                                        make("DataType",
+                                                                                                                                DataType::F16)),
+                                                                                                                        make("DataLayout", { DataLayout::NHWC })),
+                                                                                                                        make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
 }
 
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                                                                                                                      depth_multipliers),
-                                                                                                                      framework::dataset::make("DataType",
-                                                                                                                              DataType::F16)),
-                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                      ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+                                                           depth_multipliers),
+                                                   make("DataType", DataType::F16)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
-                                                                                                                    large_depth_multipliers),
-                                                                                                                    framework::dataset::make("DataType",
-                                                                                                                            DataType::F16)),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                    ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+                                                           large_depth_multipliers),
+                                                   make("DataType", DataType::F16)),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_f16, tolerance_num);
 }
@@ -427,45 +531,45 @@ TEST_SUITE_END() // Generic
 template <typename T>
 using NEDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                                                                                                                      depth_multipliers),
-                                                                                                                      framework::dataset::make("DataType",
-                                                                                                                              DataType::F16)),
-                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                      ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+                           combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), // precommit F16 run over the small 3x3 dataset
+                                   depth_multipliers,
+                                   make("DataType", DataType::F16),
+                                   make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), // both layouts are exercised
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                                                       large_depth_multipliers),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F16)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<half>,
+                           framework::DatasetMode::NIGHTLY, // nightly F16 run over the large 3x3 dataset
+                           combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                   large_depth_multipliers,
+                                   make("DataType", DataType::F16),
+                                   make("DataLayout", { DataLayout::NHWC }), // NHWC is the only layout exercised
+                                   make("ActivationInfo", { ActivationLayerInfo() }))) // single default-constructed activation info
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
 
 TEST_SUITE(Dilation)
 
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                       depth_multipliers),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F16)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerFixture<half>,
+                           framework::DatasetMode::PRECOMMIT, // precommit F16 run over the small dilated 3x3 dataset
+                           combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                   depth_multipliers,
+                                   make("DataType", DataType::F16),
+                                   make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), // both layouts are exercised
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                       large_depth_multipliers),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F16)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerFixture<half>,
+                           framework::DatasetMode::NIGHTLY, // nightly F16 run over the large dilated 3x3 dataset
+                           combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+                                   large_depth_multipliers,
+                                   make("DataType", DataType::F16),
+                                   make("DataLayout", { DataLayout::NHWC }), // NHWC is the only layout exercised
+                                   make("ActivationInfo", { ActivationLayerInfo() }))) // single default-constructed activation info
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
@@ -474,33 +578,33 @@ TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // W3x3
 
 TEST_SUITE(Optimized)
-FIXTURE_DATA_TEST_CASE(RunSmallW3x3, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F16)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallW3x3, NEDepthwiseConvolutionLayerFixture<half>,
+                           framework::DatasetMode::PRECOMMIT, // precommit F16 run over the small optimized 3x3 dataset
+                           combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                   make("DepthMultiplier", 1), // depth multiplier fixed to 1
+                                   make("DataType", DataType::F16),
+                                   make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunSmallW5x5, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
-                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F16)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallW5x5, NEDepthwiseConvolutionLayerFixture<half>,
+                           framework::DatasetMode::PRECOMMIT, // precommit F16 run over the small optimized 5x5 dataset
+                           combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+                                   make("DepthMultiplier", 1), // depth multiplier fixed to 1
+                                   make("DataType", DataType::F16),
+                                   make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLargeW3x3, NEDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                               framework::dataset::make("DataType",
-                                                                        DataType::F16)),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLargeW3x3, NEDepthwiseConvolutionLayerFixture<half>,
+                           framework::DatasetMode::NIGHTLY, // nightly F16 run over the large optimized 3x3 dataset
+                           combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                   make("DepthMultiplier", 1), // depth multiplier fixed to 1
+                                   make("DataType", DataType::F16),
+                                   make("DataLayout", { DataLayout::NHWC }), // NHWC is the only layout exercised
+                                   make("ActivationInfo", { ActivationLayerInfo() }))) // single default-constructed activation info
 {
     validate(Accessor(_target), _reference, tolerance_f16);
 }
@@ -511,94 +615,162 @@ TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
 template <typename T>
-using NEDepthwiseConvolutionLayerQuantizedFixtureOptimized = DepthwiseConvolutionLayerValidationQuantizedFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
-template <typename T>
 using NEDepthwiseConvolutionLayerQuantizedFixture                    = DepthwiseConvolutionLayerValidationQuantizedFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
+template <typename T> // T is the quantized element type chosen by each test case (uint8_t in the cases below)
+using NEDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture     = DepthwiseConvolutionLayerValidationQuantizedFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T, true>; // same fixture as the plain quantized alias plus a trailing 'true' argument; presumably the mixed-data-layout switch -- confirm against the fixture definition
 using NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture = DepthwiseConvolutionLayerValidationQuantizedPerChannelFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, uint8_t, int8_t>;
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                                                                       depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, // nightly QASYMM8 case: one fixed configuration crossed with the activation dataset
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)), // single input shape
+        make("Weights", Size2D(3U, 4U)), // single, non-square weights size
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 5 }),
+        make("DataType", DataType::QASYMM8),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(0.05f, 4) }),
+        make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }), // both layouts are exercised
+        ActivationFunctionsQuantizedDataset)) // the only dataset here defined outside this invocation
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 
+TEST_SUITE(Generic)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // precommit QASYMM8 case without activation
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo, // occupies the argument position where RunSmallWithActivation passes input_qinfo_dataset
+        IgnoredQuantizationInfo, // occupies the argument position where RunSmallWithActivation passes "DstQuantizationInfo"
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation)) // defined outside this view; activations are covered by RunSmallWithActivation
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // precommit QASYMM8 case with explicit quantization info and activations
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset, // input quantization infos come from a shared dataset defined outside this view
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) }), // single fixed destination quantization
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout, NEDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // uses the MixedDataLayout fixture alias rather than the plain quantized one
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        make("DepthMultiplier", { 2 }), // single depth multiplier instead of the shared depth_multipliers dataset
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                                                                       depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // precommit QASYMM8 dilated case without activation
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo, // occupies the argument position where RunSmallWithActivation passes input_qinfo_dataset
+        IgnoredQuantizationInfo, // occupies the argument position where RunSmallWithActivation passes "DstQuantizationInfo"
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // precommit QASYMM8 dilated case with explicit quantization info and activations
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset, // input quantization infos come from a shared dataset defined outside this view
+        make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) }), // single fixed destination quantization
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset)) // same activation dataset as the other quantized *WithActivation cases
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
-                                                                       large_depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, // nightly QASYMM8 dilated case without activation
+    combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo, // both quantization-info arguments use the shared IgnoredQuantizationInfo dataset
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // precommit QASYMM8 3x3 case without activation
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo, // occupies the argument position where RunSmallWithActivation passes input_qinfo_dataset
+        IgnoredQuantizationInfo, // occupies the argument position where RunSmallWithActivation passes "DstQuantizationInfo"
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                                                                       large_depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // precommit QASYMM8 3x3 case with explicit quantization info and activations
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset, // input quantization infos come from a shared dataset defined outside this view
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }), // single fixed destination quantization
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, // nightly QASYMM8 3x3 case without activation
+    combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo, // both quantization-info arguments use the shared IgnoredQuantizationInfo dataset
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 
 TEST_SUITE(Dilation)
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // precommit QASYMM8 dilated 3x3 case without activation
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo, // occupies the argument position where RunSmallWithActivation passes input_qinfo_dataset
+        IgnoredQuantizationInfo, // occupies the argument position where RunSmallWithActivation passes "DstQuantizationInfo"
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // precommit QASYMM8 dilated 3x3 case with explicit quantization info and activations
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset, // input quantization infos come from a shared dataset defined outside this view
+        make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) }), // single fixed destination quantization
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                                       large_depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, // nightly QASYMM8 dilated 3x3 case without activation
+    combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo, // both quantization-info arguments use the shared IgnoredQuantizationInfo dataset
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
@@ -606,39 +778,69 @@ TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // W3x3
 
 TEST_SUITE(Optimized)
-FIXTURE_DATA_TEST_CASE(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                                               framework::dataset::make("DataType",
-                                                                                        DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // precommit QASYMM8 optimized 3x3 case without activation
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1), // depth multiplier fixed to 1
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo, // occupies the argument position where RunSmall3x3WithActivation passes input_qinfo_dataset
+        IgnoredQuantizationInfo, // occupies the argument position where RunSmall3x3WithActivation passes "DstQuantizationInfo"
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3WithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // precommit QASYMM8 optimized 3x3 case with explicit quantization info and activations
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1), // depth multiplier fixed to 1
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset, // input quantization infos come from a shared dataset defined outside this view
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }), // single fixed destination quantization
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunMixedDataLayout3x3, NEDepthwiseConvolutionLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // uses the MixedDataLayout fixture alias rather than the plain quantized one
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1), // depth multiplier fixed to 1
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // precommit QASYMM8 optimized 5x5 case without activation
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+        make("DepthMultiplier", 1), // depth multiplier fixed to 1
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo, // occupies the argument position where RunSmall5x5WithActivation passes input_qinfo_dataset
+        IgnoredQuantizationInfo, // occupies the argument position where RunSmall5x5WithActivation passes "DstQuantizationInfo"
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall5x5, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
-                                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                                               framework::dataset::make("DataType",
-                                                                                        DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5WithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, // precommit QASYMM8 optimized 5x5 case with explicit quantization info and activations
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+        make("DepthMultiplier", 1), // depth multiplier fixed to 1
+        make("DataType", DataType::QASYMM8),
+        input_qinfo_dataset, // input quantization infos come from a shared dataset defined outside this view
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }), // single fixed destination quantization
+        make("DataLayout", { DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<uint8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                                               framework::dataset::make("DataType",
-                                                                                        DataType::QASYMM8)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, // nightly QASYMM8 optimized 3x3 case without activation
+    combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1), // depth multiplier fixed to 1
+        make("DataType", DataType::QASYMM8),
+        IgnoredQuantizationInfo, // both quantization-info arguments use the shared IgnoredQuantizationInfo dataset
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NHWC }),
+        NoActivation))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
@@ -646,170 +848,269 @@ TEST_SUITE_END() // Optimized
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
-TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                                                                       depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 5 }),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(0.05f, 4) }),
+        make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }),
+        ActivationFunctionsQuantizedDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 
+TEST_SUITE(Generic)
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                                                                       depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
-                                                                       large_depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.8f, 1) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
 TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
 
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                                                                       large_depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
 
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
-                                                                       large_depth_multipliers),
-                                                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmallWithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(),
+        depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.7f, 10) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(),
+        large_depth_multipliers,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
 TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // W3x3
 
 TEST_SUITE(Optimized)
-FIXTURE_DATA_TEST_CASE(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                                               framework::dataset::make("DataType",
-                                                                                        DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall5x5, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
-                                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                                               framework::dataset::make("DataType",
-                                                                                        DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3WithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedFixtureOptimized<int8_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                       framework::dataset::make("DepthMultiplier", 1)),
-                                                               framework::dataset::make("DataType",
-                                                                                        DataType::QASYMM8_SIGNED)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        NoActivation))
 {
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall5x5WithActivation, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset5x5(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        input_qinfo_dataset,
+        make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) }),
+        make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+        ActivationFunctionsQuantizedSmallDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
+        make("DepthMultiplier", 1),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        IgnoredQuantizationInfo,
+        IgnoredQuantizationInfo,
+        make("DataLayout", { DataLayout::NCHW }),
+        NoActivation))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
 TEST_SUITE_END() // Optimized
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 TEST_SUITE(QSYMM8_PER_CHANNEL)
+
+FIXTURE_DATA_TEST_CASE_NEW(RunActivations, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY,
+    combine(
+        make("In", TensorShape(33U, 27U, 11U, 3U)),
+        make("Weights", Size2D(3U, 4U)),
+        make("Info", PadStrideInfo(1, 2, 0, 1)),
+        make("Dilation", Size2D(2U, 2U)),
+        make("DepthMultiplier", { 5 }),
+        make("InputDataType", DataType::QASYMM8),
+        make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL),
+        make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) }),
+        make("DstQuantizationInfo", { QuantizationInfo(0.05f, 4) }),
+        make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW }),
+        ActivationFunctionsQuantizedDataset))
+{
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
 TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                                                                               depth_multipliers),
-                                                                       framework::dataset::make("InputDataType", DataType::QASYMM8)),
-                                                               framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                                                                                   depth_multipliers),
+                                                                           make("InputDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           input_qinfo_dataset),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 
 TEST_SUITE(Dilation)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
-                                                                               depth_multipliers),
-                                                                       framework::dataset::make("InputDataType", DataType::QASYMM8)),
-                                                               framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
+                                                                                   depth_multipliers),
+                                                                           make("InputDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           input_qinfo_dataset),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
-                                                                               depth_multipliers),
-                                                                       framework::dataset::make("InputDataType", DataType::QASYMM8)),
-                                                               framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(),
+                                                                                   depth_multipliers),
+                                                                           make("InputDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           input_qinfo_dataset),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
@@ -817,27 +1118,27 @@ TEST_SUITE_END() // Dilation
 TEST_SUITE_END() // Generic
 
 TEST_SUITE(Optimized)
-FIXTURE_DATA_TEST_CASE(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                               framework::dataset::make("DepthMultiplier", 1)),
-                                                                       framework::dataset::make("InputDataType", DataType::QASYMM8)),
-                                                               framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::PRECOMMIT,
+                           combine(combine(combine(combine(combine(combine(combine(datasets::SmallOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                                                   make("DepthMultiplier", 1)),
+                                                                           make("InputDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           input_qinfo_dataset),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
-                                                                               framework::dataset::make("DepthMultiplier", 1)),
-                                                                       framework::dataset::make("InputDataType", DataType::QASYMM8)),
-                                                               framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
-                                                       input_qinfo_dataset),
-                                               framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
-                                       framework::dataset::make("DataLayout", { DataLayout::NHWC })),
-                               ActivationFunctionsDataset))
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge3x3, NEDepthwiseConvolutionLayerQuantizedSymmetricPerChannelFixture, framework::DatasetMode::NIGHTLY,
+                           combine(combine(combine(combine(combine(combine(combine(datasets::LargeOptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                                                   make("DepthMultiplier", 1)),
+                                                                           make("InputDataType", DataType::QASYMM8)),
+                                                                   make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)),
+                                                           input_qinfo_dataset),
+                                                   make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })),
+                                           make("DataLayout", { DataLayout::NHWC })),
+                                   make("ActivationInfo", { ActivationLayerInfo() })))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
@@ -846,7 +1147,7 @@ TEST_SUITE_END() // QSYMM8_PER_CHANNEL
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // DepthwiseConvLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
index 64f6a93255..221fc5d249 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/NEON/Helper.h"
 #include "tests/framework/Macros.h"
@@ -37,12 +38,12 @@ namespace validation
 {
 using namespace arm_compute::misc::shape_calculator;
 
-// Create function for NEDepthwiseConvolutionLayerKernel
-using NEDepthwiseConvolutionLayerNative = NESynthetizeFunctionWithZeroConstantKernelBorder<NEDepthwiseConvolutionLayerNativeKernel>;
+// Create function for CpuDepthwiseConv2dNativeKernel
+using CpuDepthwiseConvolutionNative = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuDepthwiseConv2dNativeKernel>;
 
 // Fixture for NEDepthwiseConvolutionLayerKernel
 template <typename T>
-using NEDepthwiseConvolutionLayerNativeFixture = DepthwiseConvolutionLayerNativeValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayerNative, T>;
+using CpuDepthwiseConvolutionNativeFixture = DepthwiseConvolutionLayerNativeValidationFixture<Tensor, Accessor, CpuDepthwiseConvolutionNative, T>;
 
 namespace
 {
@@ -98,78 +99,83 @@ const auto data_type_values = framework::dataset::make("data_type", { DataType::
 
 /** Data layout values to test - All */
 const auto data_layout_values = framework::dataset::make("data_layout", { DataLayout::NHWC });
+} // namespace
 
-/** Configuration test */
-void validate_configuration(size_t width_value, size_t height_value, size_t channel_value, size_t batch_value, Size2D kernel_sz_value, size_t depth_multiplier_value, Size2D dilation_value, Size2D stride_value, bool padding_valid_value, DataType data_type_value, DataLayout data_layout_value)
+TEST_SUITE(NEON)
+TEST_SUITE(DepthwiseConvolutionLayerNative)
+
+TEST_CASE(ValidateNoPadding, framework::DatasetMode::ALL)
 {
-    TensorShape src_shape(width_value, height_value, channel_value, batch_value);
-    TensorShape weights_shape(kernel_sz_value.width, kernel_sz_value.height, channel_value * depth_multiplier_value);
-    TensorShape biases_shape(channel_value * depth_multiplier_value);
+    // Checks that configuring the native depthwise kernel leaves every tensor info without padding.
+    constexpr uint32_t vector_size = 8; // Assumed vector size of the current native kernel
+    constexpr auto     depth = vector_size * 2 + 1; // Misaligned depth (not a multiple of vector_size) to force padding, if any exists
+    constexpr auto     data_layout = DataLayout::NHWC;
+    constexpr auto     data_type = DataType::F32;
+
+    const auto input_size  = Size2D{ 100, 100 }; // Arbitrary plane size of the input
+    const auto kernel_size = Size2D{ 4, 4 }; // Arbitrary plane size of the kernel
+    const auto pad_stride_info = PadStrideInfo(3, 3); // Arbitrary convolution information, passed to configure() through ConvolutionInfo
+
+    TensorShape src_shape{ depth, input_size.x(), input_size.y() }; // Depth is the first dimension, followed by the plane size
+    TensorShape weights_shape{ depth, kernel_size.x(), kernel_size.y() };
+    TensorShape bias_shape{ depth };
+
+    auto src     = create_tensor<Tensor>(src_shape, data_type, 1, QuantizationInfo(), data_layout);
+    auto weights = create_tensor<Tensor>(weights_shape, data_type, 1, QuantizationInfo(), data_layout);
+    auto biases  = create_tensor<Tensor>(bias_shape, data_type, 1, QuantizationInfo(), data_layout);
+    auto dst     = create_tensor<Tensor>(TensorShape(), data_type, 1, QuantizationInfo(), data_layout); // Empty shape: presumably initialised by configure() - TODO confirm
+
+    cpu::kernels::CpuDepthwiseConv2dNativeKernel dwc;
+    const ConvolutionInfo info{pad_stride_info, 1, ActivationLayerInfo(), Size2D(1, 1)}; // NOTE(review): presumably depth multiplier 1, no activation, dilation 1x1 - confirm against ConvolutionInfo
+    dwc.configure(src.info(), weights.info(), biases.info(), dst.info(), info); // Only configures the kernel; it is never run in this test
+
+    ARM_COMPUTE_EXPECT(src.info()->padding().empty(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(weights.info()->padding().empty(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(biases.info()->padding().empty(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->padding().empty(), framework::LogLevel::ERRORS);
+}
 
-    if(data_layout_value == DataLayout::NHWC)
-    {
-        permute(src_shape, PermutationVector(2U, 0U, 1U, 3U));
-        permute(weights_shape, PermutationVector(2U, 0U, 1U));
-    }
+TEST_SUITE(KERNEL_SELECTION)
+DATA_TEST_CASE(KernelSelection_mul_and_add, framework::DatasetMode::ALL, // NOTE(review): "_mul_and_add" does not describe what is tested here (native depthwise kernel selection); possibly a copy-paste leftover - confirm
+               combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType", { DataType::F32,
+                                                              DataType::F16,
+                                                              DataType::QASYMM8_SIGNED,
+                                                              DataType::QASYMM8,
+                                                              DataType::QSYMM8_PER_CHANNEL
+                                                            })),
+                       framework::dataset::make("DataType_per_channel", { DataType::QASYMM8,
+                                                                          DataType::QASYMM8_SIGNED
+                                                            })),
+                cpu_ext, data_type, data_type_per_channel)
+{
+    using namespace cpu::kernels;
 
-    TensorInfo src_info(src_shape, 1, data_type_value);
-    TensorInfo weights_info(weights_shape, 1, data_type_value);
-    TensorInfo biases_info(biases_shape, 1, data_type_value);
+    cpuinfo::CpuIsaInfo cpu_isa{}; // Value-initialised; only the two fields below are set explicitly
+    cpu_isa.neon = (cpu_ext == "NEON"); // Enabled when the dataset requests the NEON extension
+    cpu_isa.fp16 = (data_type == DataType::F16); // Reported only for the F16 case
 
-    src_info.set_data_layout(data_layout_value);
-    weights_info.set_data_layout(data_layout_value);
-    biases_info.set_data_layout(data_layout_value);
+    const auto *selected_impl = CpuDepthwiseConv2dNativeKernel::get_implementation( // Ask the kernel which implementation it prefers for this data type / ISA combination
+        DepthwiseConv2dNativeDataTypeISASelectorData{ data_type, data_type_per_channel,cpu_isa },
+        cpu::KernelSelectionType::Preferred );
 
-    PadStrideInfo conv_info;
-    if(padding_valid_value)
-    {
-        conv_info = PadStrideInfo();
-    }
-    else
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); // Guard against a null selection before it is dereferenced below
+
+    std::string per_channel_str = "_"; // Separator placed between the data type and the kernel suffix
+    if (data_type == DataType::QSYMM8_PER_CHANNEL)
     {
-        conv_info = calculate_same_pad(src_shape, weights_shape, PadStrideInfo(stride_value.width, stride_value.height), data_layout_value, dilation_value);
+        per_channel_str = "_" + cpu_impl_dt(data_type_per_channel) + "_" ; // Per-channel case: the second data type is also embedded in the expected name
     }
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type)  + per_channel_str + "deptwiseconv2dnative"; // NOTE: this suffix is compared against the selected kernel's name at runtime, so its spelling must stay as is
+    std::string actual   = selected_impl->name;
 
-    const TensorShape dst_shape = compute_depthwise_convolution_shape(src_info, weights_info, conv_info, depth_multiplier_value, dilation_value);
-
-    // Create tensors
-    Tensor src      = create_tensor<Tensor>(src_shape, data_type_value, 1, QuantizationInfo(), data_layout_value);
-    Tensor weights  = create_tensor<Tensor>(weights_shape, data_type_value, 1, QuantizationInfo(), data_layout_value);
-    Tensor biases   = create_tensor<Tensor>(biases_shape, data_type_value, 1, QuantizationInfo(), data_layout_value);
-    Tensor dst      = create_tensor<Tensor>(dst_shape, data_type_value, 1, QuantizationInfo(), data_layout_value);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(biases.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEDepthwiseConvolutionLayerNative dwc;
-    dwc.configure(&src, &weights, &biases, &dst, conv_info, depth_multiplier_value, dilation_value);
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS); // The expected and the selected kernel names must be identical
 }
-} // namespace
+TEST_SUITE_END() // KERNEL_SELECTION
 
-TEST_SUITE(NEON)
-TEST_SUITE(DepthwiseConvolutionLayerNative)
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(width_values_precommit,
-                                                                                                                                           height_values_precommit),
-                                                                                                                                           channel_values_precommit),
-                                                                                                                                           batch_values_precommit),
-                                                                                                                                           kernel_sz_values_precommit),
-                                                                                                                                           depth_multiplier_values),
-                                                                                                                                           dilation_values),
-                                                                                                                                           stride_values),
-                                                                                                                                           padding_valid_values),
-                                                                                                                                           data_type_values),
-                                                                                                                                           data_layout_values),
-width_value, height_value, channel_value, batch_value, kernel_sz_value, depth_multiplier_value, dilation_value, stride_value, padding_valid_value, data_type_value, data_layout_value)
-{
-    validate_configuration(width_value, height_value, channel_value, batch_value, kernel_sz_value, depth_multiplier_value, dilation_value, stride_value, padding_valid_value, data_type_value, data_layout_value);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL,
+FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CpuDepthwiseConvolutionNativeFixture<float>, framework::DatasetMode::ALL,
                 combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(width_values_precommit,
                                                                                                 height_values_precommit),
                                                                                                 channel_values_precommit),
@@ -186,7 +192,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerNativeFixture<float>
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::NIGHTLY,
+FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CpuDepthwiseConvolutionNativeFixture<float>, framework::DatasetMode::NIGHTLY,
                 combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(width_values_nightly,
                                                                                                 height_values_nightly),
                                                                                                 channel_values_nightly),
@@ -206,7 +212,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerNativeFixture<float>
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // DepthwiseConvolutionLayerNative
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
-\ No newline at end of file
+} // namespace arm_compute
diff --git a/tests/validation/NEON/DequantizationLayer.cpp b/tests/validation/NEON/DequantizationLayer.cpp
index 4389419d73..82a2149152 100644
--- a/tests/validation/NEON/DequantizationLayer.cpp
+++ b/tests/validation/NEON/DequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -117,32 +117,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-DATA_TEST_CASE(Configuration,
-               framework::DatasetMode::ALL,
-               combine(datasets::SmallShapes(), data_types),
-               shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, DataType::QASYMM8, 1, QuantizationInfo(0.5f, -10));
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEDequantizationLayer dequant_layer;
-    dequant_layer.configure(&src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    validate(src.info()->padding(), PaddingSize());
-    validate(dst.info()->padding(), PaddingSize());
-}
-
 template <typename T>
 using NEDequantizationLayerFixture = DequantizationValidationFixture<Tensor, Accessor, NEDequantizationLayer, T>;
 
@@ -175,7 +149,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDequantizationLayerFixture<float>, framework:
 TEST_SUITE_END() // FP32
 
 TEST_SUITE_END() // DequantizationLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Derivative.cpp b/tests/validation/NEON/Derivative.cpp
deleted file mode 100644
index bc778dea0e..0000000000
--- a/tests/validation/NEON/Derivative.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEDerivative.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/GradientDimensionDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/DerivativeFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(Derivative)
-
-using NEDerivativeFixture = DerivativeValidationFixture<Tensor, Accessor, NEDerivative, uint8_t, int16_t>;
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                   Format::U8)),
-               shape, border_mode, format)
-{
-    // Generate a random constant value
-    std::mt19937                           gen(library->seed());
-    std::uniform_int_distribution<uint8_t> int_dist(0, 255);
-    const uint8_t                          constant_border_value = int_dist(gen);
-
-    // Create tensors
-    Tensor src   = create_tensor<Tensor>(shape, data_type_from_format(format));
-    Tensor dst_x = create_tensor<Tensor>(shape, DataType::S16);
-    Tensor dst_y = create_tensor<Tensor>(shape, DataType::S16);
-
-    src.info()->set_format(format);
-    dst_x.info()->set_format(Format::S16);
-    dst_y.info()->set_format(Format::S16);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create Derivative configure function
-    NEDerivative derivative;
-    derivative.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value);
-
-    // Validate valid region
-    constexpr BorderSize border_size{ 1 };
-    const ValidRegion    dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, border_size);
-
-    validate(dst_x.info()->valid_region(), dst_valid_region);
-    validate(dst_y.info()->valid_region(), dst_valid_region);
-
-    // TODO(COMPMID-415) Validate padding after fixing x-access input bug in NEON kernel
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDerivativeFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                         Format::U8)),
-                                                                                                 datasets::GradientDimensions()))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDerivativeFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                       Format::U8)),
-                                                                                               datasets::GradientDimensions()))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-TEST_SUITE_END() // Derivative
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/DetectionPostProcessLayer.cpp b/tests/validation/NEON/DetectionPostProcessLayer.cpp
index 4413ed470f..7d725327b7 100644
--- a/tests/validation/NEON/DetectionPostProcessLayer.cpp
+++ b/tests/validation/NEON/DetectionPostProcessLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -150,7 +150,7 @@ inline void base_test_case(DetectionPostProcessLayerInfo info, DataType data_typ
         quantize_and_fill_tensor(Accessor(anchors), anchors_vector);
     }
 
-    // Determine the output through the NEON kernel
+    // Determine the output through the Compute Library operator
     Tensor                      output_boxes;
     Tensor                      output_classes;
     Tensor                      output_scores;
@@ -384,7 +384,7 @@ TEST_CASE(Quantized_regular, framework::DatasetMode::ALL)
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE_END() // DetectionPostProcessLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Dilate.cpp b/tests/validation/NEON/Dilate.cpp
deleted file mode 100644
index b75848b02f..0000000000
--- a/tests/validation/NEON/Dilate.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEDilate.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/DilateFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /* Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /* Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Dilate)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
-                                                                   datasets::BorderModes()),
-               shape, data_type, border_mode)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEDilate dilate;
-    dilate.configure(&src, &dst, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size);
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(1);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-1);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
-template <typename T>
-using NEDilateFixture = DilateValidationFixture<Tensor, Accessor, NEDilate, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDilateFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                              DataType::U8)),
-                                                                                                      datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDilateFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::U8)),
-                                                                                                    datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END() // Dilate
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/DilatedConvolutionLayer.cpp b/tests/validation/NEON/DilatedConvolutionLayer.cpp
index 97afa24ed5..fbfe8b8a7a 100644
--- a/tests/validation/NEON/DilatedConvolutionLayer.cpp
+++ b/tests/validation/NEON/DilatedConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/cpu/operators/CpuConv2d.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/DilatedConvolutionLayerDataset.h"
@@ -49,7 +50,7 @@ const AbsoluteTolerance<float>            abs_tolerance_f16(0.3f);
 const RelativeTolerance<half_float::half> rel_tolerance_f16(half_float::half(0.2f)); /**< Relative tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 constexpr float                           tolerance_num_f16 = 0.07f;                 /**< Tolerance number for FP16 */
 #endif                                                                               /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0);                           /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+constexpr AbsoluteTolerance<int32_t> tolerance_qasymm8(1);                           /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
 
 /** CNN data types */
 const auto CNNDataTypes = framework::dataset::make("DataType",
@@ -96,7 +97,7 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
                                           framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
                input_info, weights_info, output_info, conv_info, dilation, expected)
 {
-    ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false),
+    ConvolutionMethod is_valid = cpu::CpuConv2d::get_convolution_method(&input_info.clone()->set_is_resizable(false),
                                                                             &weights_info.clone()->set_is_resizable(false),
                                                                             &output_info.clone()->set_is_resizable(false),
                                                                             conv_info, WeightsInfo(), dilation);
@@ -108,49 +109,6 @@ TEST_SUITE_END() // DilatedConvolutionLayer
 
 TEST_SUITE(GEMMDilatedConvolutionLayer)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallDilatedConvolutionLayerDataset(),
-                                                                   CNNDataTypes),
-               input_shape, weights_shape, bias_shape, output_shape, info, dilation, data_type)
-{
-    auto bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
-
-    // Create tensors
-    Tensor src     = create_tensor<Tensor>(input_shape, data_type, 1, QuantizationInfo(2.f / 255.f, 127));
-    Tensor weights = create_tensor<Tensor>(weights_shape, data_type, 1, QuantizationInfo(2.f / 255.f, 127));
-    Tensor bias    = create_tensor<Tensor>(bias_shape, bias_data_type, 1, QuantizationInfo(2.f / 255.f, 127));
-    Tensor dst     = create_tensor<Tensor>(output_shape, data_type, 1, QuantizationInfo(2.f / 255.f, 127));
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    const QuantizationInfo src_quantization_info     = src.info()->quantization_info();
-    const QuantizationInfo weights_quantization_info = weights.info()->quantization_info();
-
-    // Create and configure function
-    NEGEMMConvolutionLayer conv;
-    conv.configure(&src, &weights, &bias, &dst, info, WeightsInfo(), dilation);
-
-    // Validate valid region
-    const ValidRegion src_valid_region     = shape_to_valid_region(input_shape);
-    const ValidRegion weights_valid_region = shape_to_valid_region(weights_shape);
-    const ValidRegion bias_valid_region    = shape_to_valid_region(bias_shape);
-    const ValidRegion dst_valid_region     = shape_to_valid_region(output_shape);
-
-    validate(src.info()->valid_region(), src_valid_region);
-    validate(weights.info()->valid_region(), weights_valid_region);
-    validate(bias.info()->valid_region(), bias_valid_region);
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate QuantizationInfo
-    ARM_COMPUTE_EXPECT(src.info()->quantization_info() == src_quantization_info, framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(weights.info()->quantization_info() == weights_quantization_info, framework::LogLevel::ERRORS);
-
-    // Validate padding
-    //TODO(COMPMID-415) Need to validate padding?
-}
-
 template <typename T>
 using NEGEMMDilatedConvolutionLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEConvolutionLayer, T>;
 
@@ -204,13 +162,18 @@ template <typename T>
 using NEGEMMDilatedConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T>;
 
 TEST_SUITE(Quantized)
+/// @note: Every asymmetric quantized test where there's no fused activation will have its quantization info ignored
+/// This is because instead of using the same quantization information for all the tensors, the fixture generates
+/// separate quantization info for each input and the output tensor.
+/// When we can also support dynamic quantization with the presence of activation, we can remove the explicit
+/// quantization info.
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMDilatedConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(),
                                                                framework::dataset::make("ReshapeWeights", { true })),
                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
                                                framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("IgnoredQuantizationInfo", { QuantizationInfo() })),
                                framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
 {
     // Validate output
@@ -221,7 +184,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMDilatedConvolutionLayerQuantizedFixture<u
                                                                framework::dataset::make("ReshapeWeights", { true })),
                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
                                                framework::dataset::make("DataLayout", { DataLayout::NCHW })),
-                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("IgnoredQuantizationInfo", { QuantizationInfo() })),
                                framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
 {
     // Validate output
@@ -231,7 +194,7 @@ TEST_SUITE_END() // QASYMM8
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // GEMMDilatedConvolutionLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp
index 05bfbc171a..0779c9d388 100644
--- a/tests/validation/NEON/DirectConvolutionLayer.cpp
+++ b/tests/validation/NEON/DirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,10 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuDirectConv2dKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ShapeDatasets.h"
@@ -69,8 +73,8 @@ const auto data_pad_f16 = concat(combine(framework::dataset::make("PadX", { 0, 1
                                                  framework::dataset::make("KernelSize", 1))));
 
 const auto data_f32 = combine(datasets::SmallDirectConvolutionShapes(),
-                              combine(framework::dataset::make("StrideX", { 1, 2, 3 }),
-                                      combine(framework::dataset::make("StrideY", { 1, 2, 3 }),
+                              combine(framework::dataset::make("StrideX", { 1, 2, 3, 4 }),
+                                      combine(framework::dataset::make("StrideY", { 1, 2, 3, 4 }),
                                               data_pad_f32)));
 
 const auto data_f16 = combine(datasets::SmallDirectConvolutionShapes(),
@@ -78,25 +82,52 @@ const auto data_f16 = combine(datasets::SmallDirectConvolutionShapes(),
                                       combine(framework::dataset::make("StrideY", { 1, 2, 3 }),
                                               data_pad_f16)));
 
-const auto data = combine(datasets::SmallDirectConvolutionShapes(),
-                          combine(framework::dataset::make("StrideX", { 1 }),
-                                  combine(framework::dataset::make("StrideY", { 1 }),
-                                          combine(framework::dataset::make("PadX", { 1 }),
-                                                  combine(framework::dataset::make("PadY", { 1 }),
-                                                          framework::dataset::make("KernelSize", 3))))));
+const auto data_prec = combine(datasets::SmallDirectConvolutionShapes(),
+                               combine(framework::dataset::make("StrideX", { 1 }),
+                                       combine(framework::dataset::make("StrideY", { 1 }),
+                                               combine(framework::dataset::make("PadX", { 1 }),
+                                                       combine(framework::dataset::make("PadY", { 1 }),
+                                                               framework::dataset::make("KernelSize", 3))))));
 
 const auto data9x9 = combine(datasets::SmallDirectConvolutionShapes(),
-                             combine(framework::dataset::make("StrideX", { 1 }),
-                                     combine(framework::dataset::make("StrideY", { 1 }),
+                             combine(framework::dataset::make("StrideX", { 1, 2, 3 }),
+                                     combine(framework::dataset::make("StrideY", { 1, 2, 3 }),
                                              combine(framework::dataset::make("PadX", { 0, 2 }),
                                                      combine(framework::dataset::make("PadY", { 0, 3 }),
                                                              framework::dataset::make("KernelSize", 9))))));
 
-const auto data_f32_nightly = combine(data_f32, framework::dataset::make("NumKernels", { 1, 4 }));
-const auto data_f16_nightly = combine(data_f16, framework::dataset::make("NumKernels", { 1, 4 }));
+const auto data8x8 = combine(datasets::SmallDirectConvolutionShapes(),
+                             combine(framework::dataset::make("StrideX", { 1, 2, 3 }),
+                                     combine(framework::dataset::make("StrideY", { 1, 2, 3 }),
+                                             combine(framework::dataset::make("PadX", { 0 }),
+                                                     combine(framework::dataset::make("PadY", { 0 }),
+                                                             framework::dataset::make("KernelSize", 8))))));
 
-const auto data_precommit    = combine(data, framework::dataset::make("NumKernels", { 1 }));
+const auto data_f32_nightly = combine(data_f32, framework::dataset::make("NumKernels", { 1, 4, 5 }));
+const auto data_f16_nightly = combine(data_f16, framework::dataset::make("NumKernels", { 1, 4, 5 }));
+
+const auto data_precommit    = combine(data_prec, framework::dataset::make("NumKernels", { 1 }));
 const auto data_precommit9x9 = combine(data9x9, framework::dataset::make("NumKernels", { 4 }));
+const auto data_precommit8x8 = combine(data8x8, framework::dataset::make("NumKernels", { 4 }));
+
+/* The following test comes from a real use case that made DirectConvolution
+ * overflow in its tensor indexing. This test case uses a separate
+ * tolerance for the following reasons:
+ * - It has been shown to require a generally larger absolute tolerance
+ *   for large numbers, or a larger relative tolerance for small numbers.
+ * - Given the first reason, and since it mainly tests index overflow,
+ *   a value with some margin is used to avoid unintended test failures
+ *   during nightly runs.
+ */
+constexpr AbsoluteTolerance<float> usecase_tolerance_fp32(0.05f);
+
+const auto data_nightly_usecase = combine(framework::dataset::make("InputShape", { TensorShape{ 3U, 800U, 800U } }),
+                                          combine(framework::dataset::make("StrideX", { 1 }),
+                                                  combine(framework::dataset::make("StrideY", { 1 }),
+                                                          combine(framework::dataset::make("PadX", { 4 }),
+                                                                  combine(framework::dataset::make("PadY", { 4 }),
+                                                                          combine(framework::dataset::make("KernelSize", 9),
+                                                                                  framework::dataset::make("NumKernels", { 16 })))))));
 
 /** Activation function Dataset*/
 const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
@@ -109,17 +140,95 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
 TEST_SUITE(NEON)
 TEST_SUITE(DirectConvolutionLayer)
 
+/** Check whether the configuration of a Direct Convolution layer with no
+ * bias leads to a successful execution.
+ */
+TEST_CASE(NoBias, framework::DatasetMode::PRECOMMIT)
+{
+    const auto     src_shape     = TensorShape(27U, 13U, 2U);
+    const auto     weights_shape = TensorShape(3U, 3U, 2U, 4U);
+    const auto     bias_shape    = TensorShape(4U);
+    const auto     dst_shape     = TensorShape(25U, 11U, 4U);
+    constexpr auto dt            = DataType::F32;
+
+    auto src     = create_tensor<Tensor>(src_shape, dt);
+    auto weights = create_tensor<Tensor>(weights_shape, dt);
+    auto dst     = create_tensor<Tensor>(dst_shape, dt);
+
+    const auto conv_info = PadStrideInfo(1, 1, 0, 0);
+
+    // Create Direct Convolution function
+    NEDirectConvolutionLayer conv{};
+    conv.configure(&src, &weights, nullptr, &dst, conv_info);
+
+    src.allocator()->allocate();
+    weights.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    library->fill_tensor_value(Accessor(src), 1.f);
+    library->fill_tensor_value(Accessor(weights), 1.f);
+
+    conv.run();
+
+    // Compute reference to compare
+    SimpleTensor<float> ref_src{ src_shape, dt };
+    SimpleTensor<float> ref_weights{ weights_shape, dt };
+    SimpleTensor<float> ref_bias{ bias_shape, dt };
+    library->fill_tensor_value(ref_src, 1.f);
+    library->fill_tensor_value(ref_weights, 1.f);
+    // No bias
+    library->fill_tensor_value(ref_bias, 0.f);
+    auto ref_dst = reference::convolution_layer<float>(ref_src, ref_weights, ref_bias, dst_shape, conv_info);
+
+    validate(Accessor(dst), ref_dst);
+}
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
+               concat(combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                      framework::dataset::make("DataType", { DataType::F32 })),
+                              framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                      combine(combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                      framework::dataset::make("DataType", { DataType::F16 })),
+                              framework::dataset::make("DataLayout", { DataLayout::NCHW }))),
+               cpu_ext, data_type, data_layout)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuDirectConv2dKernel::get_implementation(DataTypeDataLayoutISASelectorData{ data_type, data_layout, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string data_layout_str;
+    if(data_layout == DataLayout::NCHW)
+    {
+        data_layout_str = "nchw";
+    }
+    else
+    {
+        data_layout_str = "nhwc";
+    }
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_" + data_layout_str + "_directconv2d";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
-        framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching data type input/weights
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching input feature maps
+        framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid: Mismatching data type input/weights
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid: Mismatching input feature maps
                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported kernel width
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Non-rectangular weights dimensions
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported non-rectangular weights dimensions
                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid weights dimensions
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid stride
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid biases size
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid biases dimensions
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported stride
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported biases size
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported biases dimensions
                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid output size
                                               }),
         framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F16),
@@ -165,7 +274,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
                                                        framework::dataset::make("ActivationInfo",
 {
     ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(),
+    ActivationLayerInfo(),
+    ActivationLayerInfo(),
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
 })),
         framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false })),
         input_info, weights_info, biases_info, output_info, conv_info, act_info, expected)
@@ -176,10 +292,47 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-//TODO(COMPMID-415): Configuration tests?
+DATA_TEST_CASE(NoPaddingNHWCKernel, framework::DatasetMode::ALL, combine(combine(combine(data_precommit,
+                                                                                         framework::dataset::make("DataType", DataType::F32)),
+                                                                                 ActivationFunctionsDataset),
+                                                                         framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+
+               shape, stride_x, stride_y, pad_x, pad_y, kernel_size, num_kernels, data_type, act_info, data_layout)
+{
+    TensorShape         input_shape = TensorShape(shape);
+    TensorShape         weights_shape(kernel_size, kernel_size, input_shape.z(), num_kernels);
+    const PadStrideInfo info(stride_x, stride_y, pad_x, pad_y, DimensionRoundingType::FLOOR);
+
+    TensorInfo input_info   = TensorInfo(input_shape, 1, data_type);
+    TensorInfo weights_info = TensorInfo(weights_shape, 1, data_type);
+
+    TensorShape output_shape = compute_deep_convolution_shape(input_info, weights_info, info);
+
+    if(data_layout == DataLayout::NHWC)
+    {
+        permute(input_shape, PermutationVector(2U, 0U, 1U));
+        permute(weights_shape, PermutationVector(2U, 0U, 1U));
+        permute(output_shape, PermutationVector(2U, 0U, 1U));
+    }
+
+    // Create tensors
+    Tensor src     = create_tensor<Tensor>(input_shape, data_type, 1, QuantizationInfo(), data_layout);
+    Tensor weights = create_tensor<Tensor>(weights_shape, data_type, 1, QuantizationInfo(), data_layout);
+    Tensor dst     = create_tensor<Tensor>(output_shape, data_type, 1, QuantizationInfo(), data_layout);
+
+    // Create and configure function
+    NEDirectConvolutionLayer conv;
+    conv.configure(&src, &weights, nullptr, &dst, info, act_info);
+
+    validate(src.info()->padding(), PaddingSize(0, 0, 0, 0));
+    validate(weights.info()->padding(), PaddingSize(0, 0, 0, 0));
+    validate(dst.info()->padding(), PaddingSize(0, 0, 0, 0));
+}
 
 template <typename T>
 using NEDirectConvolutionLayerFixture = DirectConvolutionValidationFixture<Tensor, Accessor, NEDirectConvolutionLayer, T>;
+template <typename T>
+using NEDirectConvolutionLayerMixedDataLayoutFixture = DirectConvolutionValidationFixture<Tensor, Accessor, NEDirectConvolutionLayer, T, true>;
 
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -211,6 +364,24 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectConvolutionLayerFixture<float>, framewo
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEDirectConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit,
+                       framework::dataset::make("DataType", DataType::F32)),
+                       ActivationFunctionsDataset),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall8x8, NEDirectConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit8x8, framework::dataset::make("DataType",
+                                                                                                                       DataType::F32)),
+                                                                                                                       ActivationFunctionsDataset),
+                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
 FIXTURE_DATA_TEST_CASE(RunSmall9x9, NEDirectConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit9x9, framework::dataset::make("DataType",
                                                                                                                        DataType::F32)),
                                                                                                                        ActivationFunctionsDataset),
@@ -227,10 +398,18 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDirectConvolutionLayerFixture<float>, framewo
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32);
 }
+FIXTURE_DATA_TEST_CASE(RunLargeUsecase, NEDirectConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data_nightly_usecase, framework::dataset::make("DataType",
+                       DataType::F32)),
+                       framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, usecase_tolerance_fp32);
+}
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // DirectConvolutionLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseAbsoluteValue.cpp b/tests/validation/NEON/ElementwiseAbsoluteValue.cpp
index 3aee46fa93..0667ac73f9 100644
--- a/tests/validation/NEON/ElementwiseAbsoluteValue.cpp
+++ b/tests/validation/NEON/ElementwiseAbsoluteValue.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,33 +46,23 @@ RelativeTolerance<float> tolerance_fp32(0.000001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<float> tolerance_fp16(0.01f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+#else  // #if !defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); // There is difference of 1, because quantizing in reference uses round policy "TO_NEAREST_UP", where the armv7a neon kernel uses "TO_ZERO"
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
+#endif // #if !defined(__aarch64__)
 } // namespace
 
 TEST_SUITE(NEON)
 TEST_SUITE(AbsLayer)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEAbsLayer neg_layer;
-    neg_layer.configure(&src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 template <typename T>
 using NEAbsLayerFixture = AbsValidationFixture<Tensor, Accessor, NEAbsLayer, T>;
 
+template <typename T>
+using NEAbsLayerQuantizedFixture = AbsQuantizedValidationFixture<Tensor, Accessor, NEAbsLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -127,8 +117,34 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEAbsLayerFixture<int32_t>, framework::DatasetM
 TEST_SUITE_END() // S32
 TEST_SUITE_END() // Integer
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAbsLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.2, -3) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.5, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEAbsLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.075, 6) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.1, -7) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // AbsLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseDivision.cpp b/tests/validation/NEON/ElementwiseDivision.cpp
index f5e1f86dbc..95db4ad5fd 100644
--- a/tests/validation/NEON/ElementwiseDivision.cpp
+++ b/tests/validation/NEON/ElementwiseDivision.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,12 @@ namespace validation
 namespace
 {
 RelativeTolerance<float> tolerance_fp32(0.000001f);
+AbsoluteTolerance<int>   tolerance_zero_s32(0); // Tolerance for S32 division
+
 /** Input data sets **/
+const auto ElementwiseDivisionS32Dataset = combine(combine(framework::dataset::make("DataType", DataType::S32),
+                                                           framework::dataset::make("DataType", DataType::S32)),
+                                                   framework::dataset::make("DataType", DataType::S32));
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<half> tolerance_fp16(static_cast<half>(0.01f));
 const auto              ElementwiseDivisionFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F16)),
@@ -51,6 +56,8 @@ const auto              ElementwiseDivisionFP16Dataset = combine(combine(framewo
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 const auto ElementwiseDivisionFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                     framework::dataset::make("DataType", DataType::F32));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -88,10 +95,41 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
+// These test cases execute the function with dynamic-stated shapes.
+// Since other elementwise operations share the same kernel, these tests are added only here.
+// Also, only FP32 is tested since the data type doesn't/shouldn't matter with dynamic shapes.
+TEST_SUITE(DynamicShape)
+template <typename T>
+using CpuElementwiseDivisionDynamicShapeFixture = ArithmeticDivisionDynamicShapeValidationFixture<Tensor, Accessor, NEElementwiseDivision, T>;
+
+template <typename T>
+using CpuElementwiseDivisionBroadcastDynamicShapeFixture = ArithmeticDivisionBroadcastDynamicShapeValidationFixture<Tensor, Accessor, NEElementwiseDivision, T>;
+
+TEST_SUITE(F32)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuElementwiseDivisionDynamicShapeFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseDivisionFP32Dataset),
+                                                                                                                        InPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CpuElementwiseDivisionBroadcastDynamicShapeFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+                       ElementwiseDivisionFP32Dataset),
+                       OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
+}
+
+TEST_SUITE_END() // F32
+TEST_SUITE_END() // DynamicShape
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseDivisionFP16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseDivisionFP16Dataset),
+                                                                                                          InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp16, 0.01);
@@ -100,24 +138,8 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallShapes(), datasets::LargeShapes()),
-               shape)
-{
-    // Create tensors
-    Tensor ref_src1 = create_tensor<Tensor>(shape, DataType::F32);
-    Tensor ref_src2 = create_tensor<Tensor>(shape, DataType::F32);
-    Tensor dst      = create_tensor<Tensor>(shape, DataType::F32);
-
-    // Create and Configure function
-    NEElementwiseDivision add;
-    add.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseDivisionFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseDivisionFP32Dataset),
+                                                                                                           InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -126,8 +148,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<float>, framework:
 template <typename T>
 using NEElementwiseDivisionBroadcastFixture = ArithmeticDivisionBroadcastValidationFixture<Tensor, Accessor, NEElementwiseDivision, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseDivisionBroadcastFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapesBroadcast(),
-                       ElementwiseDivisionFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseDivisionBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+                       ElementwiseDivisionFP32Dataset),
+                       OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwiseDivisionBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcastInplace(),
+                       ElementwiseDivisionFP32Dataset),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -135,6 +165,17 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseDivisionBroadcastFixture<
 TEST_SUITE_END() // F32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(Integer)
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture<int32_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseDivisionS32Dataset),
+                                                                                                             InPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_zero_s32);
+}
+TEST_SUITE_END() // S32
+TEST_SUITE_END() // Integer
+
 TEST_SUITE_END() // ElementwiseDivision
 TEST_SUITE_END() // NEON
 } // namespace validation
diff --git a/tests/validation/NEON/ElementwiseExpLayer.cpp b/tests/validation/NEON/ElementwiseExpLayer.cpp
index 37a8bb7683..31cd78626f 100644
--- a/tests/validation/NEON/ElementwiseExpLayer.cpp
+++ b/tests/validation/NEON/ElementwiseExpLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,32 +46,25 @@ RelativeTolerance<float> tolerance_fp32(0.000001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<float> tolerance_fp16(0.01f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#if defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+#else  // #if !defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); // There is a difference of 1, because quantizing in the reference uses round policy "TO_NEAREST_UP", whereas the armv7a neon kernel uses "TO_ZERO"
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
+#endif // #if defined(__aarch64__)
+
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(ExpLayer)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEExpLayer exp_layer;
-    exp_layer.configure(&src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 template <typename T>
 using NEExpLayerFixture = ExpValidationFixture<Tensor, Accessor, NEExpLayer, T>;
 
+template <typename T>
+using NEExpLayerQuantizedFixture = ExpQuantizedValidationFixture<Tensor, Accessor, NEExpLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -101,8 +94,34 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEExpLayerFixture<float>, framework::DatasetMod
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEExpLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.01, 0) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.003, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEExpLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.02, -1) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.002, -2) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // ExpLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseKernelSelection.cpp b/tests/validation/NEON/ElementwiseKernelSelection.cpp
new file mode 100644
index 0000000000..7990a51936
--- /dev/null
+++ b/tests/validation/NEON/ElementwiseKernelSelection.cpp
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuElementwiseKernel.h"
+#include "src/cpu/kernels/CpuElementwiseUnaryKernel.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(NEON)
+TEST_SUITE(KernelSelection)
+
+DATA_TEST_CASE(KernelSelection_elementwise_unary, framework::DatasetMode::ALL, concat(
+                   combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                           framework::dataset::make("DataType", { DataType::F32,
+                                                                  DataType::F16,
+                                                                  DataType::S32
+                                                                })),
+                   combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                           framework::dataset::make("DataType", { DataType::F32,
+                                                                  DataType::F16,
+                                                                  DataType::S32
+                                                                }))),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_elementwise_unary";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
+DATA_TEST_CASE(KernelSelection_elementwise_arithmetic, framework::DatasetMode::ALL, concat(concat(
+                                                                                               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                                                                                       framework::dataset::make("DataType", { DataType::F32,
+                                                                                                               DataType::F16,
+                                                                                                               DataType::S32,
+                                                                                                               DataType::S16,
+                                                                                                               DataType::QASYMM8,
+                                                                                                               DataType::QASYMM8_SIGNED
+                                                                                                                                            })),
+                                                                                               combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                                                                                                       framework::dataset::make("DataType", { DataType::F32,
+                                                                                                               DataType::F16,
+                                                                                                               DataType::S32,
+                                                                                                               DataType::S16
+                                                                                                                                            }))),
+                                                                                           combine(framework::dataset::make("CpuExt", std::string("SVE2")),
+                                                                                                   framework::dataset::make("DataType", { DataType::QASYMM8,
+                                                                                                           DataType::QASYMM8_SIGNED
+                                                                                                                                        }))),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.sve2 = (cpu_ext == "SVE2");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuArithmeticKernel::get_implementation(
+                                    ElementwiseDataTypeISASelectorData{ data_type, cpu_isa, static_cast<int>(ArithmeticOperation::ADD) },
+                                    cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_arithmetic";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
+DATA_TEST_CASE(KernelSelection_elementwise_comparison, framework::DatasetMode::ALL, concat(concat(
+                                                                                               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                                                                                                       framework::dataset::make("DataType", { DataType::F32,
+                                                                                                               DataType::F16,
+                                                                                                               DataType::S32,
+                                                                                                               DataType::S16,
+                                                                                                               DataType::U8,
+                                                                                                               DataType::QASYMM8,
+                                                                                                               DataType::QASYMM8_SIGNED
+                                                                                                                                            })),
+                                                                                               combine(framework::dataset::make("CpuExt", std::string("SVE")),
+                                                                                                       framework::dataset::make("DataType", { DataType::F32,
+                                                                                                               DataType::F16,
+                                                                                                               DataType::S32,
+                                                                                                               DataType::S16,
+                                                                                                               DataType::U8
+                                                                                                                                            }))),
+                                                                                           combine(framework::dataset::make("CpuExt", std::string("SVE2")),
+                                                                                                   framework::dataset::make("DataType", { DataType::QASYMM8,
+                                                                                                           DataType::QASYMM8_SIGNED
+                                                                                                                                        }))),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE");
+    cpu_isa.sve2 = (cpu_ext == "SVE2");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuComparisonKernel::get_implementation(
+                                    ElementwiseDataTypeISASelectorData{ data_type, cpu_isa, static_cast<int>(ComparisonOperation::Equal) },
+                                    cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_comparison";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
+TEST_SUITE_END() // KernelSelection
+TEST_SUITE_END() // Neon
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseLog.cpp b/tests/validation/NEON/ElementwiseLog.cpp
index 870c12e946..1175903dac 100644
--- a/tests/validation/NEON/ElementwiseLog.cpp
+++ b/tests/validation/NEON/ElementwiseLog.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,32 +46,25 @@ RelativeTolerance<float> tolerance_fp32(0.000001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<float> tolerance_fp16(0.01f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#if defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+#else  // #if !defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); // There is a difference of 1, because quantizing in the reference uses round policy "TO_NEAREST_UP", whereas the armv7a neon kernel uses "TO_ZERO"
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
+#endif // #if defined(__aarch64__)
+
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(LogLayer)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NELogLayer log_layer;
-    log_layer.configure(&src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 template <typename T>
 using NELogLayerFixture = LogValidationFixture<Tensor, Accessor, NELogLayer, T>;
 
+template <typename T>
+using NELogLayerQuantizedFixture = LogQuantizedValidationFixture<Tensor, Accessor, NELogLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -107,8 +100,35 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NELogLayerFixture<float>, framework::DatasetMod
 }
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NELogLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(10.5, 0), QuantizationInfo(0.5, -10)  })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(5, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NELogLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.75, -128) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(12.5, -2) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // LogLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseMax.cpp b/tests/validation/NEON/ElementwiseMax.cpp
index bd61ba50a3..61421ab3e5 100644
--- a/tests/validation/NEON/ElementwiseMax.cpp
+++ b/tests/validation/NEON/ElementwiseMax.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,6 +62,8 @@ const auto ElementwiseMaxFP16Dataset = combine(combine(framework::dataset::make(
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 const auto ElementwiseMaxFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                framework::dataset::make("DataType", DataType::F32));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -111,7 +113,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // *INDENT-ON*
 
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseMaxS32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ElementwiseMaxS32Dataset),
+                                                                                                              InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -119,7 +122,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<int32_t>, framework::Da
 TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMaxS16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMaxS16Dataset),
+                                                                                                        InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -131,11 +135,12 @@ using NEElementwiseMaxQuantizedFixture = ElementwiseMaxValidationQuantizedFixtur
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                        ElementwiseMaxQASYMM8Dataset),
                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                                                                                                                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -144,11 +149,13 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxQuantizedFixture<uint8_t>, fram
 template <typename T>
 using NEElementwiseMaxQuantizedBroadcastFixture = ElementwiseMaxQuantizedBroadcastValidationFixture<Tensor, Accessor, NEElementwiseMax, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMaxQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
-                       ElementwiseMaxQASYMM8Dataset),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMaxQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
+                                                               ElementwiseMaxQASYMM8Dataset),
+                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -156,16 +163,26 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMaxQuantizedBroadcastFixt
 TEST_SUITE_END()
 
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                       ElementwiseMaxQASYMM8SignedDataset),
                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(10.f, 20) })),
                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f, 0) })),
-                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f, -27) })))
+                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f, -27) })),
+                                                                                                                      OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallInPlace, NEElementwiseMaxQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                       ElementwiseMaxQASYMM8SignedDataset),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(10.f, -20) })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(10.f, -20) })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(10.f, -20) })),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-
 TEST_SUITE_END()
 
 TEST_SUITE_END()
@@ -173,7 +190,8 @@ TEST_SUITE_END()
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMaxFP16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMaxFP16Dataset),
+                                                                                                     InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -182,7 +200,8 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMaxFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMaxFP32Dataset),
+                                                                                                      InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -190,8 +209,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMaxFixture<float>, framework::Data
 template <typename T>
 using NEElementwiseMaxBroadcastFixture = ElementwiseMaxBroadcastValidationFixture<Tensor, Accessor, NEElementwiseMax, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMaxBroadcastFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapesBroadcast(),
-                                                                                                                        ElementwiseMaxFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMaxBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+                                                                                                                        ElementwiseMaxFP32Dataset),
+                                                                                                                        OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwiseMaxBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcastInplace(),
+                       ElementwiseMaxFP32Dataset),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -200,7 +227,7 @@ TEST_SUITE_END() // F32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // ElementwiseMax
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseMin.cpp b/tests/validation/NEON/ElementwiseMin.cpp
index 0fc6f5fead..a134eb354d 100644
--- a/tests/validation/NEON/ElementwiseMin.cpp
+++ b/tests/validation/NEON/ElementwiseMin.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,6 +62,8 @@ const auto ElementwiseMinFP16Dataset = combine(combine(framework::dataset::make(
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 const auto ElementwiseMinFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                framework::dataset::make("DataType", DataType::F32));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -110,7 +112,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // *INDENT-ON*
 
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseMinS32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ElementwiseMinS32Dataset),
+                                                                                                              InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -118,7 +121,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<int32_t>, framework::Da
 TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMinS16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMinS16Dataset),
+                                                                                                        InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -133,23 +137,34 @@ TEST_SUITE(QASYMM8)
 template <typename T>
 using NEElementwiseMinQuantizedBroadcastFixture = ElementwiseMinQuantizedBroadcastValidationFixture<Tensor, Accessor, NEElementwiseMin, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMinQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
-                       ElementwiseMinQASYMM8Dataset),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMinQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
+                                                               ElementwiseMinQASYMM8Dataset),
+                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwiseMinQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(datasets::TinyShapesBroadcastInplace(),
+                                                               ElementwiseMinQASYMM8Dataset),
+                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 20) })),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 20) })),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 20) })),
+                               InPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                        ElementwiseMinQASYMM8Dataset),
                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) }))
-
-                      )
+                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                                                                                                                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -157,11 +172,12 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinQuantizedFixture<uint8_t>, fram
 TEST_SUITE_END()
 
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                                                                                                                       ElementwiseMaxQASYMM8SignedDataset),
                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(10.f, 20) })),
                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f, 0) })),
-                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f, -27) })))
+                                                                                                                      framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f, -27) })),
+                                                                                                                      OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
@@ -174,7 +190,8 @@ TEST_SUITE_END()
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMinFP16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMinFP16Dataset),
+                                                                                                     InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -183,7 +200,8 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMinFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMinFP32Dataset),
+                                                                                                      InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -192,8 +210,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseMinFixture<float>, framework::Data
 template <typename T>
 using NEElementwiseMinBroadcastFixture = ElementwiseMinBroadcastValidationFixture<Tensor, Accessor, NEElementwiseMin, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMinBroadcastFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapesBroadcast(),
-                                                                                                                        ElementwiseMinFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseMinBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+                                                                                                                        ElementwiseMinFP32Dataset),
+                                                                                                                        OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwiseMinBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcastInplace(),
+                       ElementwiseMinFP32Dataset),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -202,7 +228,7 @@ TEST_SUITE_END() // F32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // ElementwiseMin
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseNegation.cpp b/tests/validation/NEON/ElementwiseNegation.cpp
index 7e7c838472..5b8ae8fc64 100644
--- a/tests/validation/NEON/ElementwiseNegation.cpp
+++ b/tests/validation/NEON/ElementwiseNegation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,43 +46,36 @@ RelativeTolerance<float> tolerance_fp32(0.000001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<float> tolerance_fp16(0.01f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+#else  // #if !defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); // A difference of 1 is tolerated because quantizing in the reference uses the rounding policy "TO_NEAREST_UP", whereas the armv7a Neon kernel uses "TO_ZERO"
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
+#endif // #if !defined(__aarch64__)
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(NegLayer)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NENegLayer neg_layer;
-    neg_layer.configure(&src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-}
+template <typename T>
+using NENegLayerFixture = NegValidationInPlaceFixture<Tensor, Accessor, NENegLayer, T>;
 
 template <typename T>
-using NENegLayerFixture = NegValidationFixture<Tensor, Accessor, NENegLayer, T>;
+using NENegLayerQuantizedFixture = NegQuantizedValidationFixture<Tensor, Accessor, NENegLayer, T>;
 
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                     DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(),
+                                                                                                             framework::dataset::make("DataType", DataType::F16)),
+                                                                                                     framework::dataset::make("InPlace", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NENegLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                   DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NENegLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(),
+                                                                                                           framework::dataset::make("DataType", DataType::F16)),
+                                                                                                   framework::dataset::make("InPlace", { false })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp16);
@@ -92,15 +85,17 @@ TEST_SUITE_END() // FP16
 #endif           // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(),
+                                                                                                        framework::dataset::make("DataType", DataType::F32)),
+                                                                                                framework::dataset::make("InPlace", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NENegLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                    DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NENegLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(),
+                                                                                                            framework::dataset::make("DataType", DataType::F32)),
+                                                                                                    framework::dataset::make("InPlace", { false })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32);
@@ -110,15 +105,17 @@ TEST_SUITE_END() // Float
 
 TEST_SUITE(Integer)
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerFixture<int32_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                  DataType::S32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerFixture<int32_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(),
+                                                                                                          framework::dataset::make("DataType", DataType::S32)),
+                                                                                                  framework::dataset::make("InPlace", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NENegLayerFixture<int32_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                      DataType::S32)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NENegLayerFixture<int32_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(),
+                                                                                                              framework::dataset::make("DataType", DataType::S32)),
+                                                                                                      framework::dataset::make("InPlace", { false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -126,8 +123,34 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NENegLayerFixture<int32_t>, framework::DatasetM
 TEST_SUITE_END() // S32
 TEST_SUITE_END() // Integer
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.2, -3) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.5, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.075, 6) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.1, -7) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // NegLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwisePower.cpp b/tests/validation/NEON/ElementwisePower.cpp
index 3ca39e840a..9ac9eec280 100644
--- a/tests/validation/NEON/ElementwisePower.cpp
+++ b/tests/validation/NEON/ElementwisePower.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,6 +51,8 @@ const auto              ElementwisePowerFP16Dataset = combine(combine(framework:
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 const auto ElementwisePowerFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                  framework::dataset::make("DataType", DataType::F32));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -91,7 +93,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwisePowerFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwisePowerFP16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwisePowerFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwisePowerFP16Dataset),
+                                                                                                       InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp16, 0.01);
@@ -100,30 +103,16 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallShapes(), datasets::LargeShapes()),
-               shape)
-{
-    // Create tensors
-    Tensor ref_src1 = create_tensor<Tensor>(shape, DataType::F32);
-    Tensor ref_src2 = create_tensor<Tensor>(shape, DataType::F32);
-    Tensor dst      = create_tensor<Tensor>(shape, DataType::F32);
-
-    // Create and Configure function
-    NEElementwisePower power;
-    power.configure(&ref_src1, &ref_src2, &dst);
 
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwisePowerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwisePowerFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwisePowerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwisePowerFP32Dataset),
+                                                                                                        InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEElementwisePowerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), ElementwisePowerFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEElementwisePowerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ElementwisePowerFP32Dataset),
+                                                                                                            InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -132,15 +121,23 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEElementwisePowerFixture<float>, framework::Da
 template <typename T>
 using NEElementwisePowerBroadcastFixture = ElementwisePowerBroadcastValidationFixture<Tensor, Accessor, NEElementwisePower, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwisePowerBroadcastFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapesBroadcast(),
-                       ElementwisePowerFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwisePowerBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapesBroadcast(),
+                       ElementwisePowerFP32Dataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
 }
-
-FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEElementwisePowerBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapesBroadcast(),
-                       ElementwisePowerFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwisePowerBroadcastFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapesBroadcastInplace(),
+                       ElementwisePowerFP32Dataset),
+                       InPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEElementwisePowerBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapesBroadcast(),
+                       ElementwisePowerFP32Dataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -149,7 +146,7 @@ TEST_SUITE_END() // F32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // ElementwisePower
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseRound.cpp b/tests/validation/NEON/ElementwiseRound.cpp
index 1d8cff6d73..620618cb0b 100644
--- a/tests/validation/NEON/ElementwiseRound.cpp
+++ b/tests/validation/NEON/ElementwiseRound.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -40,31 +40,20 @@ namespace test
 {
 namespace validation
 {
+namespace
+{
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+} // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(RoundLayer)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NERoundLayer round_layer;
-    round_layer.configure(&src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 template <typename T>
 using NERoundLayerFixture = RoundValidationFixture<Tensor, Accessor, NERoundLayer, T>;
 
+template <typename T>
+using NERoundLayerQuantizedFixture = RoundQuantizedValidationFixture<Tensor, Accessor, NERoundLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -100,8 +89,35 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NERoundLayerFixture<float>, framework::DatasetM
 }
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NERoundLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.2, -3) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.5, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NERoundLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.075, 6) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.1, -7) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // RoundLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseRsqrtLayer.cpp b/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
index 0edc06a83d..80788c893f 100644
--- a/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
+++ b/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,13 +46,42 @@ RelativeTolerance<float> tolerance_fp32(0.000001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<float> tolerance_fp16(0.01f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
+#else  // #if !defined(__aarch64__)
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); // A difference of 1 is tolerated because quantizing in the reference uses the rounding policy "TO_NEAREST_UP", whereas the armv7a Neon kernel uses "TO_ZERO"
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
+#endif // #if !defined(__aarch64__)
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(RsqrtLayer)
 
+// These test cases will execute the function with dynamic-stated shapes.
+// Since other elementwise unary operations share the same kernel, these tests are added only here.
+// Also, only FP32 is tested since the data type doesn't/shouldn't matter with dynamic shapes.
+TEST_SUITE(DynamicShape)
+TEST_SUITE(FP32)
+
+template <typename T>
+using CpuRsqrtDynamicShapeFixture = RsqrtDynamicShapeValidationFixture<Tensor, Accessor, NERsqrtLayer, T>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuRsqrtDynamicShapeFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                          DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // DynamicShape
+
 template <typename T>
 using NERsqrtLayerFixture = RsqrtValidationFixture<Tensor, Accessor, NERsqrtLayer, T>;
 
+template <typename T>
+using NERsqrtLayerQuantizedFixture = RsqrtQuantizedValidationFixture<Tensor, Accessor, NERsqrtLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -83,8 +112,34 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NERsqrtLayerFixture<float>, framework::DatasetM
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NERsqrtLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(20, 0) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.5, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NERsqrtLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(25, -128) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(0.1, -7) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // RsqrtLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseSin.cpp b/tests/validation/NEON/ElementwiseSin.cpp
index c68d1e5612..9c2d7ae268 100644
--- a/tests/validation/NEON/ElementwiseSin.cpp
+++ b/tests/validation/NEON/ElementwiseSin.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ElementWiseUnaryFixture.h"
+#include "tests/validation/fixtures/ElementwiseUnaryFixture.h"
 
 namespace arm_compute
 {
@@ -46,32 +46,18 @@ AbsoluteTolerance<float> tolerance_fp32(0.00001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 AbsoluteTolerance<float> tolerance_fp16(0.0005f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(0);
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(0);
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(SinLayer)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NESinLayer sin_layer;
-    sin_layer.configure(&src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 template <typename T>
 using NESinLayerFixture = SinValidationFixture<Tensor, Accessor, NESinLayer, T>;
 
+template <typename T>
+using NESinLayerQuantizedFixture = SinQuantizedValidationFixture<Tensor, Accessor, NESinLayer, T>;
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
@@ -108,8 +94,34 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESinLayerFixture<float>, framework::DatasetMod
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NESinLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.2, -3) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(200, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NESinLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(
+                       datasets::SmallShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("InputQInfo", { QuantizationInfo(0.07, 6) })),
+                       framework::dataset::make("OutputQInfo", { QuantizationInfo(123, -7) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // SinLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ElementwiseSquareDiff.cpp b/tests/validation/NEON/ElementwiseSquareDiff.cpp
index cf55c6a66b..9a86b541de 100644
--- a/tests/validation/NEON/ElementwiseSquareDiff.cpp
+++ b/tests/validation/NEON/ElementwiseSquareDiff.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -68,6 +68,8 @@ const auto ElementwiseSquaredDiffFP16Dataset = combine(combine(framework::datase
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 const auto ElementwiseSquaredDiffFP32Dataset = combine(combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F32)),
                                                        framework::dataset::make("DataType", DataType::F32));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -109,24 +111,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // *INDENT-ON*
 
 TEST_SUITE(S32)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(),
-               shape)
-{
-    // Create tensors
-    Tensor ref_src1 = create_tensor<Tensor>(shape, DataType::S32);
-    Tensor ref_src2 = create_tensor<Tensor>(shape, DataType::S32);
-    Tensor dst      = create_tensor<Tensor>(shape, DataType::S32);
-
-    // Create and Configure function
-    NEElementwiseSquaredDiff add;
-    add.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseSquaredDiffS32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffS32Dataset),
+                                                                                                                      InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -134,24 +120,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<int32_t>, frame
 TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::S16 })),
-               shape, data_type)
-{
-    // Create tensors
-    Tensor ref_src1 = create_tensor<Tensor>(shape, data_type);
-    Tensor ref_src2 = create_tensor<Tensor>(shape, DataType::S16);
-    Tensor dst      = create_tensor<Tensor>(shape, DataType::S16);
-
-    // Create and Configure function
-    NEElementwiseSquaredDiff add;
-    add.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseSquaredDiffS16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffS16Dataset),
+                                                                                                                InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -163,30 +133,12 @@ using NEElementwiseSquaredDiffQuantizedFixture = ElementwiseSquaredDiffValidatio
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(),
-               shape)
-{
-    // Create tensors
-    Tensor ref_src1 = create_tensor<Tensor>(shape, DataType::QASYMM8);
-    Tensor ref_src2 = create_tensor<Tensor>(shape, DataType::QASYMM8);
-    Tensor dst      = create_tensor<Tensor>(shape, DataType::QASYMM8);
-
-    // Create and Configure function
-    NEElementwiseMin add;
-    add.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ElementwiseSquaredDiffQASYMM8Dataset),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) }))
-
-                      )
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
@@ -195,11 +147,23 @@ template <typename T>
 using NEElementwiseSquaredDiffQuantizedBroadcastFixture = ElementwiseSquaredDiffQuantizedBroadcastValidationFixture<Tensor, Accessor, NEElementwiseSquaredDiff, T>;
 
 FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseSquaredDiffQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
-                                                       ElementwiseSquaredDiffQASYMM8Dataset),
+                       combine(combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
+                                                               ElementwiseSquaredDiffQASYMM8Dataset),
+                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               OutOfPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEElementwiseSquaredDiffQuantizedBroadcastFixture<uint8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(datasets::TinyShapesBroadcastInplace(),
+                                                               ElementwiseSquaredDiffQASYMM8Dataset),
+                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                                                framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
-                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                               InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -207,11 +171,12 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseSquaredDiffQuantizedBroad
 TEST_SUITE_END()
 
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ElementwiseSquaredDiffQASYMM8SignedDataset),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f, 5) })),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(.5f, 5) })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(.2f, 5) })))
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(.2f, 5) })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -222,7 +187,8 @@ TEST_SUITE_END()
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP16Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP16Dataset),
+                                                                                                             InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp16, 0.01);
@@ -231,24 +197,8 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(),
-               shape)
-{
-    // Create tensors
-    Tensor ref_src1 = create_tensor<Tensor>(shape, DataType::F32);
-    Tensor ref_src2 = create_tensor<Tensor>(shape, DataType::F32);
-    Tensor dst      = create_tensor<Tensor>(shape, DataType::F32);
-
-    // Create and Configure function
-    NEElementwiseSquaredDiff add;
-    add.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP32Dataset),
+                                                                                                              InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -256,15 +206,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture<float>, framewo
 template <typename T>
 using NEElementwiseSquaredDiffBroadcastFixture = ElementwiseSquaredDiffBroadcastValidationFixture<Tensor, Accessor, NEElementwiseSquaredDiff, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseSquaredDiffBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapesBroadcast(),
-                       ElementwiseSquaredDiffFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseSquaredDiffBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapesBroadcast(),
+                       ElementwiseSquaredDiffFP32Dataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEElementwiseSquaredDiffBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapesBroadcast(),
-                       ElementwiseSquaredDiffFP32Dataset))
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEElementwiseSquaredDiffBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapesBroadcast(),
+                       ElementwiseSquaredDiffFP32Dataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -273,7 +225,7 @@ TEST_SUITE_END() // F32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // ElementwiseSquaredDiff
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/EqualizeHistogram.cpp b/tests/validation/NEON/EqualizeHistogram.cpp
deleted file mode 100644
index ad7d71000c..0000000000
--- a/tests/validation/NEON/EqualizeHistogram.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/EqualizeHistogramFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(EqualizeHistogram)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEEqualizeHistogram equalize_histogram;
-    equalize_histogram.configure(&src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
-template <typename T>
-using NEEqualizeHistogramFixture = EqualizeHistogramValidationFixture<Tensor, Accessor, NEEqualizeHistogram, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEEqualizeHistogramFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEEqualizeHistogramFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                               DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Erode.cpp b/tests/validation/NEON/Erode.cpp
deleted file mode 100644
index 0b87e60508..0000000000
--- a/tests/validation/NEON/Erode.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEErode.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ErodeFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /* Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /* Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Erode)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
-                                                                   datasets::BorderModes()),
-               shape, data_type, border_mode)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEErode erode;
-    erode.configure(&src, &dst, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size);
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(1);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-1);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
-template <typename T>
-using NEErodeFixture = ErodeValidationFixture<Tensor, Accessor, NEErode, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEErodeFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                             DataType::U8)),
-                                                                                                     datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEErodeFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                           DataType::U8)),
-                                                                                                   datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/FFT.cpp b/tests/validation/NEON/FFT.cpp
index d762630d30..f7ef0a314e 100644
--- a/tests/validation/NEON/FFT.cpp
+++ b/tests/validation/NEON/FFT.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -72,30 +72,6 @@ constexpr float          tolerance_num = 0.07f; /**< Tolerance number */
 TEST_SUITE(NEON)
 TEST_SUITE(FFT1D)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(shapes_1d, data_types),
-               shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type, 2);
-    Tensor dst = create_tensor<Tensor>(shape, data_type, 2);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEFFT1D fft1d;
-    fft1d.configure(&src, &dst, FFT1DInfo());
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    validate(src.info()->padding(), PaddingSize());
-    validate(dst.info()->padding(), PaddingSize());
-}
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
@@ -140,31 +116,6 @@ TEST_SUITE_END() // Float
 TEST_SUITE_END() // FFT1D
 
 TEST_SUITE(FFT2D)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(shapes_2d, data_types),
-               shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type, 2);
-    Tensor dst = create_tensor<Tensor>(shape, data_type, 2);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEFFT2D fft2d;
-    fft2d.configure(&src, &dst, FFT2DInfo());
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    validate(src.info()->padding(), PaddingSize());
-    validate(dst.info()->padding(), PaddingSize());
-}
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
@@ -207,6 +158,8 @@ TEST_SUITE(FFTConvolutionLayer)
 
 template <typename T>
 using NEFFTConvolutionLayerFixture = FFTConvolutionValidationFixture<Tensor, Accessor, NEFFTConvolutionLayer, T>;
+template <typename T>
+using NEFFTConvolutionLayerMixedDataLayoutFixture = FFTConvolutionValidationFixture<Tensor, Accessor, NEFFTConvolutionLayer, T, true>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
@@ -218,11 +171,18 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEFFTConvolutionLayerFixture<float>, framework:
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32, tolerance_num);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFFTConvolutionLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFFTConvolutionLayerDataset(),
+                                                                                                                 framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                 ActivationFunctionsSmallDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32, tolerance_num);
+}
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // FFTConvolutionLayer
-
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/FastCorners.cpp b/tests/validation/NEON/FastCorners.cpp
deleted file mode 100644
index 4416662fea..0000000000
--- a/tests/validation/NEON/FastCorners.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEFastCorners.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/ArrayAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ImageFileDatasets.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/FastCornersFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/* Radius of the Bresenham circle around the candidate point */
-const unsigned int bresenham_radius = 3;
-/* Tolerance used to compare corner strengths */
-const AbsoluteTolerance<float> tolerance(0.5f);
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(FastCorners)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()),
-                                                                                   framework::dataset::make("Format", Format::U8)),
-                                                                           framework::dataset::make("SuppressNonMax", { false, true })),
-                                                                   framework::dataset::make("BorderMode", BorderMode::UNDEFINED)),
-               shape, format, suppress_nonmax, border_mode)
-{
-    std::mt19937                           gen(library->seed());
-    std::uniform_int_distribution<uint8_t> int_dist(0, 255);
-    std::uniform_real_distribution<float>  real_dist(0, 255);
-
-    const uint8_t constant_border_value = int_dist(gen);
-    const float   threshold             = real_dist(gen);
-
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type_from_format(format));
-    src.info()->set_format(format);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    KeyPointArray corners;
-
-    // Create and configure function
-    NEFastCorners fast_corners;
-    fast_corners.configure(&src, threshold, suppress_nonmax, &corners, border_mode, constant_border_value);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 1); // elems_processed
-
-    calculator.set_border_size(bresenham_radius);
-    calculator.set_access_offset(-bresenham_radius);
-    calculator.set_accessed_elements(8); // elems_read
-
-    validate(src.info()->padding(), calculator.required_padding());
-}
-
-template <typename T>
-using NEFastCornersFixture = FastCornersValidationFixture<Tensor, Accessor, KeyPointArray, NEFastCorners, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEFastCornersFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallImageFiles(), framework::dataset::make("Format", Format::U8)),
-                                                                                                                   framework::dataset::make("SuppressNonMax", { false, true })),
-                                                                                                           framework::dataset::make("BorderMode", BorderMode::UNDEFINED)))
-{
-    // Validate output
-    ArrayAccessor<KeyPoint> array(_target);
-    validate_keypoints(array.buffer(), array.buffer() + array.num_values(), _reference.begin(), _reference.end(), tolerance);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEFastCornersFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeImageFiles(), framework::dataset::make("Format", Format::U8)),
-                                                                                                                 framework::dataset::make("SuppressNonMax", { false, true })),
-                                                                                                         framework::dataset::make("BorderMode", BorderMode::UNDEFINED)))
-{
-    // Validate output
-    ArrayAccessor<KeyPoint> array(_target);
-    validate_keypoints(array.buffer(), array.buffer() + array.num_values(), _reference.begin(), _reference.end(), tolerance);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Fill.cpp b/tests/validation/NEON/Fill.cpp
index 59cc2cb378..0073f22677 100644
--- a/tests/validation/NEON/Fill.cpp
+++ b/tests/validation/NEON/Fill.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -100,7 +100,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEFillFixture<uint32_t>, framework::DatasetMode
 }
 TEST_SUITE_END() // U32
 
-
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEFillFixture<int32_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::S32)))
 {
@@ -118,7 +117,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEFillFixture<float>, framework::DatasetMode::A
 TEST_SUITE_END() // F32
 
 TEST_SUITE_END() // Fill
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/FillBorder.cpp b/tests/validation/NEON/FillBorder.cpp
index 7e0fb1a433..928990b2b4 100644
--- a/tests/validation/NEON/FillBorder.cpp
+++ b/tests/validation/NEON/FillBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2020, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
 #include "tests/Globals.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/datasets/BorderModeDataset.h"
@@ -60,10 +60,10 @@ DATA_TEST_CASE(FillBorder, framework::DatasetMode::ALL, combine(combine(combine(
 {
     BorderSize border_size{ static_cast<unsigned int>(size) };
 
-    std::mt19937                           generator(library->seed());
-    std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
-    const uint8_t                          border_value = distribution_u8(generator);
-    const uint8_t                          tensor_value = distribution_u8(generator);
+    std::mt19937                            generator(library->seed());
+    std::uniform_int_distribution<uint32_t> distribution_u8(0, 255);
+    const uint8_t                           border_value = distribution_u8(generator);
+    const uint8_t                           tensor_value = distribution_u8(generator);
 
     // Create tensors
     Tensor src = create_tensor<Tensor>(shape, data_type);
@@ -77,7 +77,7 @@ DATA_TEST_CASE(FillBorder, framework::DatasetMode::ALL, combine(combine(combine(
     validate(src.info()->padding(), padding);
 
     // Fill tensor with constant value
-    std::uniform_int_distribution<uint8_t> distribution{ tensor_value, tensor_value };
+    std::uniform_int_distribution<uint32_t> distribution{ tensor_value, tensor_value };
     library->fill(Accessor(src), distribution, 0);
 
     // Create and configure kernel
diff --git a/tests/validation/NEON/Flatten.cpp b/tests/validation/NEON/Flatten.cpp
index d4742a757c..4ebd897c7a 100644
--- a/tests/validation/NEON/Flatten.cpp
+++ b/tests/validation/NEON/Flatten.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -100,7 +100,7 @@ TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // FlattenLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Floor.cpp b/tests/validation/NEON/Floor.cpp
index 978b6dbbd6..3cd1033ef9 100644
--- a/tests/validation/NEON/Floor.cpp
+++ b/tests/validation/NEON/Floor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,9 +22,12 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEFloor.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuFloorKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/ShapeDatasets.h"
@@ -62,6 +65,30 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
     const Status status = NEFloor::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false));
     ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
 }
+
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
+               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType", { DataType::F32,
+                                                              DataType::F16,
+                                                            })),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuFloorKernel::get_implementation(DataTypeISASelectorData{data_type, cpu_isa}, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_floor";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
 // clang-format on
 // *INDENT-ON*
 
diff --git a/tests/validation/NEON/FullyConnectedLayer.cpp b/tests/validation/NEON/FullyConnectedLayer.cpp
index 523b3c62f1..ee7e56227d 100644
--- a/tests/validation/NEON/FullyConnectedLayer.cpp
+++ b/tests/validation/NEON/FullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,8 @@
 #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuFullyConnected.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/FullyConnectedLayerDataset.h"
@@ -40,6 +42,7 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
 namespace
 {
 /** Tolerance for float operations */
@@ -56,7 +59,7 @@ constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
 constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
 
 /** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
+const auto CNNDataTypes = make("DataType",
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     DataType::F16,
@@ -64,18 +67,25 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
     DataType::F32,
 });
 
-const auto FullyConnectedParameters = combine(framework::dataset::make("TransposeWeights", { false, true }), framework::dataset::make("ReshapeWeights", { false, true }));
+const auto FullyConnectedParameters = combine(make("TransposeWeights", { false, true }), make("ReshapeWeights", { false, true }));
 
-const auto QuantizationData = framework::dataset::make("QuantizationInfo",
+const auto QuantizationData = make("QuantizationInfo",
 {
     QuantizationInfo(1.f / 256.f, 10),
     QuantizationInfo(1.1f, 10),
 });
-const auto EmptyActivationFunctionDataset = framework::dataset::make("ActivationInfo",
+
+const auto IgnoredQuantizationData = make("IgnoredQuantizationInfo",
+{
+    QuantizationInfo(),
+});
+
+const auto NoActivationFunctionDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(),
 });
-const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+
+const auto ActivationFunctionsDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
@@ -83,7 +93,7 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH),
 });
 
-const auto ActivationFunctionsQuantizedDataset = framework::dataset::make("ActivationInfo",
+const auto ActivationFunctionsQuantizedDataset = make("ActivationInfo",
 {
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
@@ -94,40 +104,183 @@ const auto ActivationFunctionsQuantizedDataset = framework::dataset::make("Activ
 TEST_SUITE(NEON)
 TEST_SUITE(FullyConnectedLayer)
 
+/** Test case for memory injection in @ref cpu::CpuFullyConnected.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto       fc          = std::make_unique<cpu::CpuFullyConnected>();
+    const auto src_info    = TensorInfo(TensorShape(8U), 1, DataType::F32, DataLayout::NHWC);
+    const auto weight_info = TensorInfo(TensorShape(8U, 4U), 1, DataType::F32, DataLayout::NHWC);
+    const auto bias_info   = TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC);
+    auto       dst_info    = TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC);
+    const auto fc_info     = FullyConnectedLayerInfo{};
+    fc->configure(&src_info, &weight_info, &bias_info, &dst_info, fc_info);
+
+    // Input tensors are created once here and reused by every call of the lambda below
+    auto src    = create_tensor<Tensor>(src_info);
+    auto weight = create_tensor<Tensor>(weight_info);
+    auto bias   = create_tensor<Tensor>(bias_info);
+    src.allocator()->allocate();
+    weight.allocator()->allocate();
+    bias.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(fc->workspace(), mg, run_pack, prep_pack);
+
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        // This operator is configured once and captured by this lambda.
+        fc->prepare(prep_pack);
+        fc->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Test case for memory injection in @ref NEFullyConnectedLayer.
+ *
+ * Make sure @ref NEFullyConnectedLayer still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto       fc          = std::make_unique<NEFullyConnectedLayer>();
+    const auto src_info    = TensorInfo(TensorShape(8U), 1, DataType::F32, DataLayout::NHWC);
+    const auto weight_info = TensorInfo(TensorShape(8U, 4U), 1, DataType::F32, DataLayout::NHWC);
+    const auto bias_info   = TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC);
+    auto       dst_info    = TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC);
+    const auto fc_info     = FullyConnectedLayerInfo{};
+    auto       run_conv    = [&]()
+    {
+        auto src    = create_tensor<Tensor>(src_info);
+        auto weight = create_tensor<Tensor>(weight_info);
+        auto bias   = create_tensor<Tensor>(bias_info);
+        auto dst    = create_tensor<Tensor>(dst_info);
+        fc->configure(&src, &weight, &bias, &dst, fc_info);
+        src.allocator()->allocate();
+        weight.allocator()->allocate();
+        bias.allocator()->allocate();
+        dst.allocator()->allocate();
+        library->fill_tensor_value(Accessor(src), 1.f);
+        library->fill_tensor_value(Accessor(weight), 2.f);
+        library->fill_tensor_value(Accessor(bias), 3.f);
+        fc->run();
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Unit test for @ref cpu::CpuFullyConnected with quantized multiplier > 1
+ *
+ * Tests output correctness.
+ */
+TEST_CASE(Quant8_Signed_Mult_gt_1, framework::DatasetMode::ALL)
+{
+    auto       fc          = std::make_unique<cpu::CpuFullyConnected>();
+    const auto src_info    = TensorInfo(TensorShape(1U, 3U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.5f, -1));
+    const auto weight_info = TensorInfo(TensorShape(1U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.5, -8));
+    const auto bias_info   = TensorInfo(TensorShape(1U), 1, DataType::S32);
+    auto       dst_info    = TensorInfo(TensorShape(1U, 3U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.1f, 0));
+    const auto fc_info     = FullyConnectedLayerInfo{};
+    fc->configure(&src_info, &weight_info, &bias_info, &dst_info, fc_info);
+
+    // Create and allocate the tensors used by this single run
+    auto src    = create_tensor<Tensor>(src_info);
+    auto weight = create_tensor<Tensor>(weight_info);
+    auto bias   = create_tensor<Tensor>(bias_info);
+    auto dst    = create_tensor<Tensor>(dst_info);
+    src.allocator()->allocate();
+    weight.allocator()->allocate();
+    bias.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias }, { TensorType::ACL_DST, &dst } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(fc->workspace(), mg, run_pack, prep_pack);
+
+    // Initialize input values
+    const std::vector<int8_t>  src_values    = { 3, 63, 31 };
+    const std::vector<int8_t>  weight_values = { -4 };
+    const std::vector<int32_t> bias_values   = { 16 };
+    const std::vector<int32_t> expected      = { 80, 127, 127 };
+    library->fill_static_values(Accessor(src), src_values);
+    library->fill_static_values(Accessor(weight), weight_values);
+    library->fill_static_values(Accessor(bias), bias_values);
+
+    // Run FC layer
+    fc->prepare(prep_pack);
+    fc->run(run_pack);
+
+    auto dst_ptr = reinterpret_cast<int8_t *>(dst.buffer());
+    for(size_t i = 0; i < dst.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(dst_ptr[i] == expected[i], framework::LogLevel::ERRORS);
+    }
+}
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
-    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Mismatching data types
+    make("InputInfo", { TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Mismatching data types
                                             TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Invalid weights dimensions
                                             TensorInfo(TensorShape(9U, 5U, 7U, 3U), 1, DataType::F32),    // Wrongly reshaped weights
                                             TensorInfo(TensorShape(8U, 4U, 6U, 4U), 1, DataType::F32),
                                           }),
-    framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(315U, 271U), 1, DataType::F16),
+    make("WeightsInfo",{ TensorInfo(TensorShape(315U, 271U), 1, DataType::F16),
                                              TensorInfo(TensorShape(192U, 192U), 1, DataType::F32),
                                              TensorInfo(TensorShape(192U, 192U), 1, DataType::F32),
                                              TensorInfo(TensorShape(217U, 315U), 1, DataType::F32),
                                              TensorInfo(TensorShape(217U, 315U), 1, DataType::F32),
                                              TensorInfo(TensorShape(192U, 192U), 1, DataType::F32),
                                           })),
-    framework::dataset::make("BiasInfo",{ TensorInfo(TensorShape(271U), 1, DataType::F32),
+    make("BiasInfo",{ TensorInfo(TensorShape(271U), 1, DataType::F32),
                                           TensorInfo(TensorShape(192U), 1, DataType::F32),
                                           TensorInfo(TensorShape(192U), 1, DataType::F32),
                                           TensorInfo(TensorShape(271U), 1, DataType::F32),
                                           TensorInfo(TensorShape(271U), 1, DataType::F32),
                                           TensorInfo(TensorShape(192U), 1, DataType::F32),
                                           })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
+    make("OutputInfo",{ TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
                                             TensorInfo(TensorShape(192U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(192U, 4U), 1, DataType::F32),
                                             TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
                                             TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
                                             TensorInfo(TensorShape(192U, 4U), 1, DataType::F32),
                                            })),
-    framework::dataset::make("TransposeWeights",{ true, true, false, true, true, true })),
-    framework::dataset::make("ReshapedWeights",{ false, false, false, false, false , false})),
-    framework::dataset::make("Expected", { false, true, true, false, false, true })),
+    make("TransposeWeights",{ true, true, false, true, true, true })),
+    make("ReshapedWeights",{ false, false, false, false, false , false})),
+    make("Expected", { false, true, true, false, false, true })),
     input_info, weights_info, bias_info, output_info, transpose_weights, reshaped_weights, expected)
 {
     // Create Fully Connected layer info
@@ -143,130 +296,251 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
 
 template <typename T>
 using NEFullyConnectedLayerFixture = FullyConnectedLayerValidationFixture<Tensor, Accessor, NEFullyConnectedLayer, T>;
+template <typename T>
+using NEFullyConnectedLayerMixedDataLayoutFixture = FullyConnectedLayerValidationFixture<Tensor, Accessor, NEFullyConnectedLayer, T, true>;
+template <typename T>
+using NEFullyConnectedLayerDynamicWeightsFixture = FullyConnectedWithDynamicWeightsFixture<Tensor, Accessor, NEFullyConnectedLayer, T>;
+template <typename T>
+using NEFullyConnectedLayerDynamicBiasFixture = FullyConnectedWithDynamicBiasFixture<Tensor, Accessor, NEFullyConnectedLayer, T>;
 
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(),
-                                                                                                                        FullyConnectedParameters),
-                                                                                                                        framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                                                                                                                        FullyConnectedParameters,
+                                                                                                                        make("DataType", DataType::F16),
+                                                                                                                NoActivationFunctionDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16, abs_tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(
+FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
                            combine(datasets::FullyConnectedLayerWithActivationDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::F16)),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::F16),
                        ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16, abs_tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(),
-                                                                                                                      FullyConnectedParameters),
-                                                                                                                      framework::dataset::make("DataType", DataType::F16)),
-                                                                                                              EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeFullyConnectedLayerDataset(),
+                                                                                                                      FullyConnectedParameters,
+                                                                                                                      make("DataType", DataType::F16),
+                                                                                                              NoActivationFunctionDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num_f16, abs_tolerance_f16);
 }
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::F16),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)),
+                       make("WeightsReshaped", { false, true })))
+{
+}
 TEST_SUITE_END()
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters),
-                                                                                                                 framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                 EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(), FullyConnectedParameters,
+                                                                                                                 make("DataType", DataType::F32),
+                                                                                                                 NoActivationFunctionDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(
-                           combine(datasets::FullyConnectedLayerWithActivationDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::F32)),
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFullyConnectedLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(
+                           make("Input", TensorShape(9U, 5U, 7U)),
+                           make("Weights", TensorShape(315U, 271U)),
+                       make("Biases", TensorShape(271U)),
+                       make("Output", TensorShape(271U)),
+                       FullyConnectedParameters,
+                       make("DataType", DataType::F32),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::FullyConnectedLayerWithActivationDataset(),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::F32),
                        ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters),
-                                                                                                                       framework::dataset::make("DataType", DataType::F32)),
-                                                                                                               EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeFullyConnectedLayerDataset(), FullyConnectedParameters,
+                                                                                                                       make("DataType", DataType::F32),
+                                                                                                               NoActivationFunctionDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f32, 0, abs_tolerance_f32);
 }
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::F32),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)),
+                       make("WeightsReshaped", { false, true })))
+{
+}
 TEST_SUITE_END()
 TEST_SUITE_END()
 
 template <typename T>
 using NEFullyConnectedLayerQuantizedFixture = FullyConnectedLayerValidationQuantizedFixture<Tensor, Accessor, NEFullyConnectedLayer, T>;
+template <typename T>
+using NEFullyConnectedLayerQuantizedMixedDataLayoutFixture = FullyConnectedLayerValidationQuantizedFixture<Tensor, Accessor, NEFullyConnectedLayer, T, true>;
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(
-                           combine(datasets::SmallFullyConnectedLayerDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8)),
-                       QuantizationData),
-                       EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                        combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8),
+                                       QuantizationData,
+                               make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-
-FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(
+FIXTURE_DATA_TEST_CASE(RunSmallWithActivation, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                            combine(datasets::FullyConnectedLayerWithActivationDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8)),
-                       QuantizationData),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::QASYMM8),
+                       QuantizationData,
                        ActivationFunctionsQuantizedDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+FIXTURE_DATA_TEST_CASE(RunDynamicWeightsWithActivation, NEFullyConnectedLayerDynamicWeightsFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)),
+                       make("WeightsReshaped", { false })))
+{
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicBiasWithActivation, NEFullyConnectedLayerDynamicBiasFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))))
+{
+}
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(
-                           combine(datasets::LargeFullyConnectedLayerDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8)),
-                       QuantizationData),
-                       EmptyActivationFunctionDataset))
+// Dynamic Quantization tests
+FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                           combine(datasets::SmallFullyConnectedLayerDataset(),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::QASYMM8),
+                       IgnoredQuantizationData,
+                       NoActivationFunctionDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-TEST_SUITE_END()
+FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(
+                           datasets::LargeFullyConnectedLayerDataset(),
+                            FullyConnectedParameters,
+                           framework::dataset::make("DataType", DataType::QASYMM8),
+                       QuantizationData,
+                       NoActivationFunctionDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicBias, NEFullyConnectedLayerDynamicBiasFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8),
+                       NoActivationFunctionDataset))
+{
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                        combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8),
+                                       IgnoredQuantizationData,
+                               NoActivationFunctionDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8),
+                       NoActivationFunctionDataset,
+                       make("WeightsReshaped", { false })))
+{
+}
+TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(
-                           combine(datasets::SmallFullyConnectedLayerDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                       QuantizationData),
-                       EmptyActivationFunctionDataset))
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayoutWithActivation, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                        combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8_SIGNED),
+                                       QuantizationData,
+                               make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-
-FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(
+FIXTURE_DATA_TEST_CASE(RunWithActivation, NEFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
                            combine(datasets::FullyConnectedLayerWithActivationDataset(),
-                                   FullyConnectedParameters),
-                           framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                       QuantizationData),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::QASYMM8_SIGNED),
+                       QuantizationData,
                        ActivationFunctionsQuantizedDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+FIXTURE_DATA_TEST_CASE(RunDynamicWeightsWithActivation, NEFullyConnectedLayerDynamicWeightsFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8_SIGNED),
+                       make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)),
+                       make("WeightsReshaped", { false })))
+{
+}
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+// Dynamic Quantization tests
+FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(
+                           datasets::SmallFullyConnectedLayerDataset(),
+                                   FullyConnectedParameters,
+                           make("DataType", DataType::QASYMM8_SIGNED),
+                       IgnoredQuantizationData,
+                       NoActivationFunctionDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEFullyConnectedLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                                                                        combine(
+                                                                           make("Input", TensorShape(9U, 5U, 7U)),
+                                                                           make("Weights", TensorShape(315U, 271U)),
+                                                                       make("Biases", TensorShape(271U)),
+                                                               make("Output", TensorShape(271U)),
+                                                       FullyConnectedParameters,
+                                               make("DataType", DataType::QASYMM8_SIGNED),
+                                       QuantizationData,
+                               NoActivationFunctionDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE(RunDynamicWeights, NEFullyConnectedLayerDynamicWeightsFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallFullyConnectedLayerDataset(),
+                       make("DataType", DataType::QASYMM8_SIGNED),
+                       NoActivationFunctionDataset,
+                       make("WeightsReshaped", { false })))
+{
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+TEST_SUITE_END() // FullyConnectedLayer
+TEST_SUITE_END() // NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/FuseBatchNormalization.cpp b/tests/validation/NEON/FuseBatchNormalization.cpp
index 1a8f928c6e..62265c6ac9 100644
--- a/tests/validation/NEON/FuseBatchNormalization.cpp
+++ b/tests/validation/NEON/FuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -214,7 +214,7 @@ TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // DepthwiseConvolution
 TEST_SUITE_END() // FuseBatchNormalization
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
-\ No newline at end of file
+} // namespace arm_compute
diff --git a/tests/validation/NEON/GEMM.cpp b/tests/validation/NEON/GEMM.cpp
index 2bcdf8a7ff..5f6a402204 100644
--- a/tests/validation/NEON/GEMM.cpp
+++ b/tests/validation/NEON/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,12 +21,16 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
+#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"
+#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
+#include "src/cpu/operators/CpuGemm.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/NEON/Helper.h"
 #include "tests/PaddingCalculator.h"
@@ -47,6 +51,8 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
+
 namespace
 {
 constexpr AbsoluteTolerance<float> tolerance_f(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for FP32 data types */
@@ -56,7 +62,7 @@ const AbsoluteTolerance<float>      abs_tolerance_f16(0.2f);      /**< Absolute
 constexpr float                     tolerance_num = 0.07f;        /**< Tolerance number for FP16 data types */
 #endif                                                            /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 /** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
+const auto CNNDataTypes = make("DataType",
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     DataType::F16,
@@ -64,103 +70,412 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
     DataType::F32,
 });
 
-const auto data_interleave = framework::dataset::make("M", 8, 12) * framework::dataset::make("N", 8, 12);
-const auto data_transpose  = framework::dataset::make("M", 8, 14) * framework::dataset::make("N", 7, 14);
+const auto data_interleave = make("M", 8, 12) * make("N", 8, 12);
+const auto data_transpose  = make("M", 8, 14) * make("N", 7, 14);
+
+/** Zero padding test */
+template <typename FunctionType>
+bool validate_zero_padding(unsigned int dim0_value, unsigned int dim1_value)
+{
+    const TensorShape in_shape(dim0_value, dim1_value);
+    TensorInfo        in(in_shape, 1, DataType::U32);
+    TensorInfo        dst;
+
+    ARM_COMPUTE_EXPECT(in.is_resizable(), framework::LogLevel::ERRORS);
 
+    // Validate zero-padding
+    FunctionType func;
+
+    func.configure(&in, &dst);
+
+    return in.padding().empty();
+}
+
+/* Zero padding test for GEMM kernels */
+bool validate_gemm_zero_padding(const TensorShape shape0, const TensorShape shape1)
+{
+    // Create tensors
+    TensorInfo in0(shape0, 1, DataType::F32);
+    TensorInfo in1(shape1, 1, DataType::F32);
+    TensorInfo dst;
+
+    // Validate zero-padding
+    cpu::kernels::CpuGemmMatrixMultiplyKernel gemm;
+    gemm.configure(&in0, &in1, &dst, 1.0, false);
+
+    return in0.padding().empty() && in1.padding().empty() && dst.padding().empty();
+}
 } // namespace
 
 TEST_SUITE(NEON)
 TEST_SUITE(GEMM)
 
+/** Test case for memory injection in @ref cpu::CpuGemm.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto       gemm      = std::make_unique<cpu::CpuGemm>();
+    const auto lhs_info  = TensorInfo(TensorShape(3U, 3U), 1, DataType::F32);
+    const auto rhs_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    const auto c_info    = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    auto       dst_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    const auto gemm_info = GEMMInfo{};
+    gemm->configure(&lhs_info, &rhs_info, &c_info, &dst_info, 1.f, 1.f, gemm_info);
+
+    // lhs, rhs and c are created once here and reused by every call of the lambda below
+    auto lhs = create_tensor<Tensor>(lhs_info);
+    auto rhs = create_tensor<Tensor>(rhs_info);
+    auto c   = create_tensor<Tensor>(c_info);
+    lhs.allocator()->allocate();
+    rhs.allocator()->allocate();
+    c.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &lhs }, { TensorType::ACL_SRC_1, &rhs }, { TensorType::ACL_SRC_2, &c } };
+    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &rhs }, { TensorType::ACL_SRC_2, &c } };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(gemm->workspace(), mg, run_pack, prep_pack);
+
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+        library->fill_tensor_value(Accessor(lhs), 1.f);
+        library->fill_tensor_value(Accessor(rhs), 2.f);
+        library->fill_tensor_value(Accessor(c), 3.f);
+        // This operator is configured once and captured by this lambda.
+        gemm->prepare(prep_pack);
+        gemm->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+/** Test case for memory injection in @ref NEGEMM.
+ *
+ * Make sure @ref NEGEMM still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto       gemm      = std::make_unique<NEGEMM>();
+    const auto lhs_info  = TensorInfo(TensorShape(3U, 3U), 1, DataType::F32);
+    const auto rhs_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    const auto c_info    = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    auto       dst_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
+    const auto gemm_info = GEMMInfo{};
+    auto       run_conv  = [&]()
+    {
+        auto lhs = create_tensor<Tensor>(lhs_info);
+        auto rhs = create_tensor<Tensor>(rhs_info);
+        auto c   = create_tensor<Tensor>(c_info);
+        auto dst = create_tensor<Tensor>(dst_info);
+        gemm->configure(&lhs, &rhs, &c, &dst, 1.f, 1.f, gemm_info);
+        lhs.allocator()->allocate();
+        rhs.allocator()->allocate();
+        c.allocator()->allocate();
+        dst.allocator()->allocate();
+        library->fill_tensor_value(Accessor(lhs), 1.f);
+        library->fill_tensor_value(Accessor(rhs), 2.f);
+        library->fill_tensor_value(Accessor(c), 3.f);
+        gemm->run();
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+               make("LhsInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::S32), // Unsupported data type
+                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),
+                                                     }),
+               make("RhsInfo",{ TensorInfo(TensorShape(8U, 27U), 1, DataType::S32),
+                                                        TensorInfo(TensorShape(8U, 27U), 1, DataType::F32),
+                                                     })),
+               make("OutputInfo",{ TensorInfo(TensorShape(8U, 13U), 1, DataType::S32),
+                                                        TensorInfo(TensorShape(8U, 13U), 1, DataType::F32),
+                                                     })),
+               make("Expected", { false, true })),
+               lhs_info, rhs_info, output_info, expected)
+{
+    constexpr float alpha = 1.0;
+    constexpr float beta = 0.0;
+    const auto gemm_info = GEMMInfo();
+    bool is_valid = bool(NEGEMM::validate(&lhs_info.clone()->set_is_resizable(true), &rhs_info.clone()->set_is_resizable(true), nullptr, &output_info.clone()->set_is_resizable(true), alpha, beta, gemm_info));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE(KERNEL_SELECTION)
+DATA_TEST_CASE(KernelSelection_mul_and_add, framework::DatasetMode::ALL,
+               combine(make("CpuExt", std::string("NEON")),
+                       make("DataType", { DataType::F32,
+                                                              DataType::F16
+                                                            })),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+    // ISA flags handed to the selector below: NEON for the "NEON" extension, FP16 only for the F16 case
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl_mul = CpuGemmMatrixMultiplyKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl_mul);
+    // Expected name is "<lower-case ext>_<cpu_impl_dt(data_type)>_gemm_matrix_mul"
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_gemm_matrix_mul";
+    std::string actual   = selected_impl_mul->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+
+    const auto *selected_impl_add = CpuGemmMatrixAdditionKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl_add);
+    // Same check for the matrix-addition kernel, with the "_gemm_matrix_add" suffix
+    expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_gemm_matrix_add";
+    actual   = selected_impl_add->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // KERNEL_SELECTION
+
 TEST_SUITE(TRANSPOSE_1XW)
-using NEGEMMTranspose1xW        = NESynthetizeFunctionWithZeroConstantBorder<NEGEMMTranspose1xWKernel, 4>;
-using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, NEGEMMTranspose1xW, float>;
-TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::F32))
+using CpuGemmTranspose1xW = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuGemmTranspose1xWKernel>;
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
+                   make("N", { 1, 23, 63, 101 }),
+                   make("K", { 1, 47, 29, 27 })),
+               n_value, k_value)
+{
+    bool status = validate_zero_padding<CpuGemmTranspose1xW>(n_value, k_value);
+    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
+TEST_SUITE(U32)
+using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint32_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * make("DataType", DataType::U32))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END() // FP32
+TEST_SUITE_END() // U32
+
+TEST_SUITE(U16)
+using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint16_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * make("DataType", DataType::U16))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // U16
+
+TEST_SUITE(U8)
+using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint8_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * make("DataType", DataType::U8))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // U8
 
 TEST_SUITE_END() // TRANSPOSE_1XW
 
 TEST_SUITE(INTERLEAVE_4X4)
-using NEGEMMInterleave4x4 = NESynthetizeFunctionWithZeroConstantBorder<NEGEMMInterleave4x4Kernel, 4>;
+using CpuGemmInterleave4x4 = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuGemmInterleave4x4Kernel>;
 
-TEST_SUITE(FP32)
-using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, float>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::F32))
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
+                   make("M", { 1, 23, 63, 101 }),
+                   make("K", { 1, 47, 29, 27 })),
+               m_value, k_value)
+{
+    bool status = validate_zero_padding<cpu::kernels::CpuGemmInterleave4x4Kernel>(m_value, k_value);
+    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
+TEST_SUITE(U32)
+using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint32_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * make("DataType", DataType::U32))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END() // FP32
+TEST_SUITE_END() // U32
 
-TEST_SUITE_END() // INTERLEAVE_4X4
+TEST_SUITE(U16)
+using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint16_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * make("DataType", DataType::U16))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // U16
+
+TEST_SUITE(U8)
+using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint8_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * make("DataType", DataType::QASYMM8))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // U8
 
-//TODO(COMPMID-415): Validate valid region
+TEST_SUITE_END() // INTERLEAVE_4X4
 
 template <typename T>
 using NEGEMMFixture = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T>;
 
 template <typename T>
-using NEGEMMFixtureDisabledC = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T, true>;
+using NEBatchedMatMulFixture = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T, true, false, false, false, false, true>;
+
+template <typename T>
+using NEGEMMAccumulateFixture = GEMMAccumulateValidationFixture<Tensor, Accessor, NEGEMM, T>;
 
 TEST_SUITE(Float)
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(make("In0", { TensorShape(21U, 13U),
+                                                                                                       TensorShape(31U, 1U),
+                                                                                                       TensorShape(31U, 1U),
+                                                                                                       TensorShape(8U, 2U),
+                                                                                                       TensorShape(38U, 12U),
+                                                                                                       TensorShape(32U, 1U)
+                                                                                                     }),
+                                                                     make("In1", { TensorShape(33U, 21U),
+                                                                                                       TensorShape(23U, 31U),
+                                                                                                       TensorShape(23U, 31U),
+                                                                                                       TensorShape(16U, 8U),
+                                                                                                       TensorShape(21U, 38U),
+                                                                                                       TensorShape(17U, 32U)
+                                                                                                     })),
+               shape0, shape1)
+{
+    bool status = validate_gemm_zero_padding(shape0, shape1);
+    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
+DATA_TEST_CASE(ValidateAccumulate, framework::DatasetMode::ALL, combine(
+                                                                     zip(make("In0",{ TensorShape(21U, 13U) }),
+                                                                     make("In1", { TensorShape(33U, 21U) }),
+                                                                     make("Dst", { TensorShape(33U, 13U) })),
+                                                                     zip(
+                                                                     make("alpha", { 1.0, 100.0, 1.0, 1.0 }),
+                                                                     make("beta", { 0.0, 0.0, 1.0, 1.0 }),
+                                                                     make("is_c_null", { false, false, false, true }),
+                                                                     make("Expected", { true, false, false, true }))),
+               shape_a, shape_b, shape_dst, alpha, beta, is_c_null, expected)
+{
+    /* Checks cpu::CpuGemm validate() with accumulation enabled against the zipped (alpha, beta, is_c_null) -> Expected cases */
+    // Tensor infos only (all F32): validate() is called on metadata, nothing is allocated in this test
+    TensorInfo in_a(shape_a, 1, DataType::F32);
+    TensorInfo in_b(shape_b, 1, DataType::F32);
+    TensorInfo in_c(shape_dst, 1, DataType::F32);
+    TensorInfo dst(shape_dst, 1, DataType::F32);
+
+    GEMMInfo gemm_info = GEMMInfo();
+    gemm_info.set_accumulate(true);
+
+    // Pass a null C when is_c_null is set, then compare the validation status with the expectation
+    cpu::CpuGemm gemm;
+    Status status = gemm.validate(&in_a, &in_b, (is_c_null ? nullptr : &in_c), &dst, alpha, beta, gemm_info);
+    ARM_COMPUTE_EXPECT((expected ==  bool(status)), framework::LogLevel::ERRORS);
+}
+
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(),
-                                                                                                         framework::dataset::make("ReshapeWeights", { true, false })),
-                                                                                                 framework::dataset::make("DataType", DataType::F16)))
+                                                                                                         make("ReshapeWeights", { true, false })),
+                                                                                                 make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeGEMMDataset(),
-                                                                                                       framework::dataset::make("ReshapeWeights", { true, false })),
+                                                                                                       make("ReshapeWeights", { true, false })),
+                                                                                               make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
+}
 
-                                                                                               framework::dataset::make("DataType", DataType::F16)))
+TEST_SUITE(BATCHED_MATMUL)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchedMatMulFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallBatchedMatMulDataset(),
+                                                                                                                  make("ReshapeWeights", { false })),
+                                                                                                          make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() // BATCHED_MATMUL
+
+TEST_SUITE_END() // FP16
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(),
-                                                                                                          framework::dataset::make("ReshapeWeights", { true, false })),
-
-                                                                                                  framework::dataset::make("DataType", DataType::F32)))
+                                                                                                          make("ReshapeWeights", { true, false })),
+                                                                                                  make("DataType", DataType::F32)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeGEMMDataset(),
-                                                                                                        framework::dataset::make("ReshapeWeights", { true, false })),
+                                                                                                        make("ReshapeWeights", { true, false })),
+                                                                                                make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f);
+}
 
-                                                                                                framework::dataset::make("DataType", DataType::F32)))
+TEST_SUITE(BATCHED_MATMUL)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchedMatMulFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallBatchedMatMulDataset(),
+                                                                                                                   make("ReshapeWeights", { false })),
+                                                                                                           make("DataType", DataType::F32)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f);
 }
-TEST_SUITE(DisabledC)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixtureDisabledC<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(),
-                                                                                                                   framework::dataset::make("ReshapeWeights", { true, false })),
+TEST_SUITE_END() // BATCHED_MATMUL
 
-                                                                                                           framework::dataset::make("DataType", DataType::F32)))
+TEST_SUITE(ACCUMULATE)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMAccumulateFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallAccumulateGEMMDataset(),
+                                                                                                        make("ReshapeWeights", { false }),
+                                                                                                        make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMAccumulateFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeAccumulateGEMMDataset(),
+                                                                                                        make("ReshapeWeights", { false }),
+                                                                                                        make("DataType", DataType::F32)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() // ACCUMULATE
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // FP32
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // GEMM
+TEST_SUITE_END() // NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index c3747ddd24..d25f43a330 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,11 +22,12 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/NEON/Helper.h"
 #include "tests/PaddingCalculator.h"
@@ -38,7 +39,6 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/GEMMLowpAssemblyFixture.h"
 #include "tests/validation/fixtures/GEMMLowpFixture.h"
 
 namespace arm_compute
@@ -47,37 +47,26 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
+
 namespace
 {
-const auto data_matrix_multiply = framework::dataset::make("M", 12, 20) * framework::dataset::make("N", 12, 20) * framework::dataset::make("K", 16);
+    constexpr AbsoluteTolerance<float> tolerance_batched(1);
+    constexpr AbsoluteTolerance<float> tolerance_quant(1);
 } // namespace
 
-TEST_SUITE(NEON)
-TEST_SUITE(ASSEMBLY_MATRIX_MULTIPLY)
-
-using NEGEMMAssemblyFixture_S8 = GEMMLowpAssemblyFixture<Tensor, Accessor, NEGEMMLowpAssemblyMatrixMultiplyCore, int8_t>;
-using NEGEMMAssemblyFixture_U8 = GEMMLowpAssemblyFixture<Tensor, Accessor, NEGEMMLowpAssemblyMatrixMultiplyCore, uint8_t>;
-
-TEST_SUITE(S8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMAssemblyFixture_S8, framework::DatasetMode::PRECOMMIT, data_matrix_multiply)
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END()
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMAssemblyFixture_U8, framework::DatasetMode::PRECOMMIT, data_matrix_multiply)
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
 
+TEST_SUITE(NEON)
 TEST_SUITE(GEMMLowp)
 TEST_SUITE(MatrixMultiplyCore)
+
 using NEGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+using NEGEMMLowpMatrixMultiplyCoreAccumulateFixture = GEMMLowpMatrixMultiplyAccumulateValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+using NEGEMMLowpBatchedMatMulFixture      = GEMMLowpMatrixMultiplyCoreValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, true>;
+using NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture = GEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+using NEGEMMLowpDequantizedMatrixMultiplyValidationFixture = GEMMLowpDequantizedMatrixMultiplyValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+
+using framework::dataset::make;
 
 DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()),
                shape_a, shape_b, shape_c, a_offset, b_offset)
@@ -97,30 +86,75 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::c
     // Create and configure function
     NEGEMMLowpMatrixMultiplyCore gemmlowp_mm;
     gemmlowp_mm.configure(&a, &b, nullptr, &c);
+
+    // Validate padding is zero
+    validate(a.info()->padding(), PaddingSize());
+    validate(b.info()->padding(), PaddingSize());
+    validate(c.info()->padding(), PaddingSize());
+}
+// accumulation is not supported for Int8/UInt8 in aarch32
+#ifdef __aarch64__
+DATA_TEST_CASE(ValidateAccumulate, framework::DatasetMode::ALL, combine(
+                                                                    zip(
+                                                                     make("In0",{ TensorShape(21U, 1U) }),
+                                                                     make("In1", { TensorShape(1U, 21U) }),
+                                                                     make("Dst", { TensorShape(1U, 1U) }),
+                                                                     make("a_offset", { -2 }),
+                                                                     make("b_offset", { 13 })
+                                                                    ),
+                                                                    zip(
+                                                                     make("OutputDataType", {  DataType::S32,  DataType::QASYMM8, DataType::QASYMM8_SIGNED}),
+                                                                     make("Expected", { true, false, false })
+                                                                    )),
+               shape_a, shape_b, shape_dst, a_offset, b_offset, output_data_type, expected)
+{
+    DataType input_data_type = (output_data_type == DataType::S32 ? DataType::QASYMM8 : output_data_type);
+    // Operand infos: an S32 output is paired with QASYMM8 inputs, otherwise the inputs share the output type
+    TensorInfo a(shape_a, 1, input_data_type, QuantizationInfo(1.0f / 255, a_offset));
+    TensorInfo b(shape_b, 1, input_data_type, QuantizationInfo(1.0f / 255, b_offset));
+    TensorInfo dst(shape_dst, 1, output_data_type, QuantizationInfo());
+
+    // Request accumulation; quantized outputs additionally get a fixed-point quantize-down output stage
+    GEMMInfo gemm_info = GEMMInfo();
+    gemm_info.set_accumulate(true);
+
+    if (is_data_type_quantized(output_data_type))
+    {
+        GEMMLowpOutputStageInfo gemmLowpOutputStageInfo = GEMMLowpOutputStageInfo();
+        gemmLowpOutputStageInfo.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+
+        gemm_info.set_gemmlowp_output_stage(gemmLowpOutputStageInfo);
+    }
+
+    cpu::CpuGemmLowpMatrixMultiplyCore gemmlowp_mm;
+    Status status = gemmlowp_mm.validate(&a, &b, nullptr, &dst, gemm_info);
+    // Per the "Expected" dataset, only the S32 output case is expected to validate
+    ARM_COMPUTE_EXPECT((expected ==  bool(status)), framework::LogLevel::ERRORS);
+}
+#endif // __aarch64__
 
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Input not a multiple of 4
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+    make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Input not a multiple of 4
                                              TensorInfo(TensorShape(21U, 13U), 1, DataType::S32),                                 // Mismatching data type
                                              TensorInfo(TensorShape(20U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions
                                              TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions
                                              TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)),
                                           }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+    make("InputBInfo",{ TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
                                             TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
                                             TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
                                             TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
                                             TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
+                                          }),
+    make("OutputInfo",{ TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
                                             TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
                                             TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
                                             TensorInfo(TensorShape(8U, 11U), 1, DataType::S32),
                                             TensorInfo(TensorShape(64U, 32U), 1, DataType::S32),
-                                           })),
-    framework::dataset::make("Expected", { false, false, false, false, true })),
+                                           }),
+    make("Expected", { true, false, false, false, true })),
     a_info, b_info, output_info, expected)
 {
     // Lock tensors
@@ -133,545 +167,222 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
+/** Test case for memory injection in @ref cpu::CpuGemmLowpMatrixMultiplyCore.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output (compared per S32 element)
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto gemm     = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>();
+    auto a_info   = TensorInfo(TensorShape(32U, 72U), 1, DataType::QASYMM8);
+    auto b_info   = TensorInfo(TensorShape(17U, 32U), 1, DataType::QASYMM8);
+    auto dst_info = TensorInfo(TensorShape(17U, 72U), 1, DataType::S32);
+    a_info.set_quantization_info(QuantizationInfo(1.0f / 255, -9));
+    b_info.set_quantization_info(QuantizationInfo(1.0f / 255, 1));
+    const auto gemm_info = GEMMInfo{};
+    gemm->configure(&a_info, &b_info, nullptr, &dst_info, gemm_info);
+
+    // a and b are created once and reused by both runs; the lambda below creates a fresh dst on every call
+    auto a   = create_tensor<Tensor>(a_info);
+    auto b   = create_tensor<Tensor>(b_info);
+    auto dst = create_tensor<Tensor>(dst_info);
+    a.allocator()->allocate();
+    b.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    ITensorPack run_pack =
+    {
+        { TensorType::ACL_SRC_0, &a },
+        { TensorType::ACL_SRC_1, &b },
+        { TensorType::ACL_DST, &dst }
+    };
+    ITensorPack prep_pack =
+    {
+        { TensorType::ACL_SRC_1, &b },
+    };
 
-using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
-TEST_SUITE(FusedOffsetOutput)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(),
-                       framework::dataset::make("DataType", { DataType::QASYMM8 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(gemm->workspace(), mg, run_pack, prep_pack);
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(),
-                       framework::dataset::make("DataType", { DataType::QASYMM8 })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
+    auto run_conv = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+        library->fill_tensor_value(Accessor(a), static_cast<uint8_t>(1));
+        library->fill_tensor_value(Accessor(b), static_cast<uint8_t>(2));
+        // This operator is configured once and captured by this lambda.
+        gemm->prepare(prep_pack);
+        gemm->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) // dst is S32: compare whole elements, not only the leading bytes
+    {
+        ARM_COMPUTE_EXPECT(reinterpret_cast<const int32_t *>(result_0.buffer())[i] == reinterpret_cast<const int32_t *>(result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
 }
-TEST_SUITE_END() // FusedOffsetOutput
-TEST_SUITE_END() // MatrixMultiplyCore
-
-TEST_SUITE(OutputStage)
-
-TEST_SUITE(QuantizeDownInt32Scale)
-
-TEST_SUITE(QASYMM8)
-
-const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2,
-                                                      3)
-                                                      * framework::dataset::make("min", 0) * framework::dataset::make("max", 255) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1,
-                                                           2)
-                                                           * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true });
-
-using NEGEMMLowpQuantizeDownInt32ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture<Tensor, Accessor, NEGEMMLowpOutputStage>;
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Input not a multiple of 16
-                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type
-                                          }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(20U), 1, DataType::S32),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8),
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                           })),
-    framework::dataset::make("Min",{        0,
-                                            13,
-                                           })),
-    framework::dataset::make("Max",{        205,
-                                            180,
-                                           })),
-    framework::dataset::make("Expected", { true, false })),
-    a_info, b_info, output_info, min, max, expected)
-{
 
-    GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo();
-    output_stage.type        = GEMMLowpOutputStageType::QUANTIZE_DOWN;
-    output_stage.gemmlowp_min_bound        = min;
-    output_stage.gemmlowp_max_bound        = max;
-    output_stage.output_data_type = DataType::QASYMM8;
-
-    // Lock tensors
-    Status status =  NEGEMMLowpOutputStage::validate(&a_info.clone()->set_is_resizable(false),
-                                                                     &b_info.clone()->set_is_resizable(false),
-                                                                     &output_info.clone()->set_is_resizable(false),
-                                                                     output_stage);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+/** Test case for memory injection in @ref NEGEMMLowpMatrixMultiplyCore.
+ *
+ * Make sure @ref NEGEMMLowpMatrixMultiplyCore still works through injecting the memory at configure time using the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output (compared per S32 element)
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+    auto gemm     = std::make_unique<NEGEMMLowpMatrixMultiplyCore>();
+    auto a_info   = TensorInfo(TensorShape(32U, 72U), 1, DataType::QASYMM8);
+    auto b_info   = TensorInfo(TensorShape(17U, 32U), 1, DataType::QASYMM8);
+    auto dst_info = TensorInfo(TensorShape(17U, 72U), 1, DataType::S32);
+    a_info.set_quantization_info(QuantizationInfo(1.0f / 255, -9));
+    b_info.set_quantization_info(QuantizationInfo(1.0f / 255, 1));
+    const auto gemm_info = GEMMInfo{};
+    auto       run_conv  = [&]() // every call builds fresh tensors and re-configures the same function object
+    {
+        auto a   = create_tensor<Tensor>(a_info);
+        auto b   = create_tensor<Tensor>(b_info);
+        auto dst = create_tensor<Tensor>(dst_info);
+        gemm->configure(&a, &b, nullptr, &dst, gemm_info);
+        a.allocator()->allocate();
+        b.allocator()->allocate();
+        dst.allocator()->allocate();
+        library->fill_tensor_value(Accessor(a), static_cast<uint8_t>(1));
+        library->fill_tensor_value(Accessor(b), static_cast<uint8_t>(2));
+        gemm->run();
+        return dst;
+    };
+    auto result_0 = run_conv();
+    auto result_1 = run_conv();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) // dst is S32: compare whole elements, not only the leading bytes
+    {
+        ARM_COMPUTE_EXPECT(reinterpret_cast<const int32_t *>(result_0.buffer())[i] == reinterpret_cast<const int32_t *>(result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
 }
-// clang-format on
-// *INDENT-ON*
 
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-TEST_SUITE_END() // BoundedReLu
-
+TEST_SUITE(BatchedMatMul)
+TEST_SUITE(QASYMM8)
+using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned =
+    GEMMLowpBatchedMatrixMultiplyCoreFusedOffsetOutputFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, uint8_t, uint8_t, true>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned, framework::DatasetMode::ALL,
+    combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(),
+        make("DataType", { DataType::QASYMM8 }),
+        make("reshape_b_only_on_first_run", { false })))
+{
+    validate(Accessor(_target), _reference, tolerance_batched);
+}
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
-
-const auto quantize_down_int32_to_int8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2,
-                                                     3)
-                                                     * framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1,
-                                                          2)
-                                                          * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", -100, -98) * framework::dataset::make("max", 71, 74) * framework::dataset::make("addBias", { false, true });
-
-using NEGEMMLowpQuantizeDownInt32ScaleFixture = GEMMLowpQuantizeDownInt32ToInt8ScaleValidationFixture<Tensor, Accessor, NEGEMMLowpOutputStage>;
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Input not a multiple of 16
-                                             TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Invalid min and max
-                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type
-                                          }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(20U), 1, DataType::S32),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED),
-                                            TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED),
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                           })),
-    framework::dataset::make("Min",{        -10,
-                                            -200,
-                                            -113,
-                                           })),
-    framework::dataset::make("Max",{        105,
-                                            300,
-                                            -18,
-                                           })),
-    framework::dataset::make("Expected", { true, false, false })),
-    a_info, b_info, output_info, min, max, expected)
-{
-    GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo();
-    output_stage.type        = GEMMLowpOutputStageType::QUANTIZE_DOWN;
-    output_stage.gemmlowp_min_bound        = min;
-    output_stage.gemmlowp_max_bound        = max;
-    output_stage.output_data_type = DataType::QASYMM8_SIGNED;
-
-    // Lock tensors
-    Status status =  NEGEMMLowpOutputStage::validate(&a_info.clone()->set_is_resizable(false),
-                                                                     &b_info.clone()->set_is_resizable(false),
-                                                                     &output_info.clone()->set_is_resizable(false),
-                                                                     output_stage);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned =
+    GEMMLowpBatchedMatrixMultiplyCoreFusedOffsetOutputFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, int8_t, int8_t, true>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned, framework::DatasetMode::ALL,
+    combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(),
+        make("DataType", { DataType::QASYMM8_SIGNED }),
+        make("reshape_b_only_on_first_run", { false })))
+{
+    validate(Accessor(_target), _reference, tolerance_batched);
 }
-// clang-format on
-// *INDENT-ON*
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // BatchedMatMul
 
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_cases))
+TEST_SUITE(FusedOffsetOutput)
+using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL,
+    combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(),
+        make("DataType", { DataType::QASYMM8 }),
+        make("reshape_b_only_on_first_run", { false })))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
+    validate(Accessor(_target), _reference, tolerance_quant);
 }
-
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_relu_cases))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY,
+    combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(),
+        make("DataType", { DataType::QASYMM8 }),
+        make("reshape_b_only_on_first_run", { false })))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-TEST_SUITE_END() // BoundedReLu
-
-TEST_SUITE_END() // QASYMM8_SIGNED
-
-TEST_SUITE_END() // QuantizeDownInt32Scale
-
-TEST_SUITE(QuantizeDownInt32ToUint8ScaleByFixedPoint)
-
-const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                    2)
-                                                                    * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0) * framework::dataset::make("max", 255) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                         2)
-                                                                         * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true });
-
-using NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint>;
-
-using NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint>;
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Input not a multiple of 16
-                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type
-                                          }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(20U), 1, DataType::S32),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8),
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                           })),
-    framework::dataset::make("Min",{        0,
-                                            13,
-                                           })),
-    framework::dataset::make("Max",{        205,
-                                            180,
-                                           })),
-    framework::dataset::make("Expected", { true, false })),
-    a_info, b_info, output_info, min, max, expected)
-{
-    // Lock tensors
-    Status status =  NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&a_info.clone()->set_is_resizable(false),
-                                                                                 &b_info.clone()->set_is_resizable(false),
-                                                                                 &output_info.clone()->set_is_resizable(false),
-                                                                                 min,
-                                                                                 max);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                                                                   quantize_down_int32_to_uint8_scale_by_fixedpoint_cases),
-               shape, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max, add_bias)
-{
-    TensorShape shape_bias(shape[0]);
-
-    // Create tensors
-    Tensor in   = create_tensor<Tensor>(shape, DataType::S32);
-    Tensor bias = create_tensor<Tensor>(shape_bias, DataType::S32);
-    Tensor out  = create_tensor<Tensor>(shape, DataType::QASYMM8);
-
-    ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint output_stage;
-    output_stage.configure(&in, add_bias ? &bias : nullptr, &out, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
-
-    // Validate valid region input and output
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(in.info()->valid_region(), valid_region);
-    validate(out.info()->valid_region(), valid_region);
-
-    // Validate valid region bias
-    if(add_bias)
-    {
-        const ValidRegion valid_region_bias = shape_to_valid_region(shape_bias);
-        validate(bias.info()->valid_region(), valid_region_bias);
-    }
-
-    // Validate padding
-    const PaddingSize padding(0);
-    validate(in.info()->padding(), padding);
-    validate(out.info()->padding(), padding);
-
-    if(add_bias)
-    {
-        validate(bias.info()->padding(), padding);
-    }
+    validate(Accessor(_target), _reference, tolerance_quant);
 }
+TEST_SUITE_END() // FusedOffsetOutput
 
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_cases))
+// accumulation is not supported for Int8/UInt8 in aarch32
+#ifdef __aarch64__
+TEST_SUITE(ACCUMULATION)
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreAccumulateFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_cases))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreAccumulateFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
+TEST_SUITE_END() // S32
+TEST_SUITE_END() // ACCUMULATION
+#endif // __aarch64__
 
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases))
+TEST_SUITE(DynamicQuantization)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(),
-                       quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END() // BoundedReLu
-
-TEST_SUITE_END() // QuantizeDownInt32ToUint8ScaleByFixedPoint
-
-TEST_SUITE(QuantizeDownInt32ToInt8ScaleByFixedPoint)
-
-const auto quantize_down_int32_to_int8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                   2)
-                                                                   * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -128) * framework::dataset::make("max", 128) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                        2)
-                                                                        * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
-
-using NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint>;
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-        framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::F32), // Invalid input data type
-                                                 TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type
-                                                 TensorInfo(TensorShape(21U, 13U), 1, DataType::S32),
-        }),
-        framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                                TensorInfo(TensorShape(20U), 1, DataType::S32),
-                                                TensorInfo(TensorShape(21U), 1, DataType::S32),
-        })),
-        framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED),
-                                                TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                                TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8_SIGNED),
-        })),
-        framework::dataset::make("Min",{ -110,
-                                         -113,
-                                         -113,
-        })),
-        framework::dataset::make("Max",{ 87,
-                                         97,
-                                         97,
-        })),
-        framework::dataset::make("Expected", { false, false, true })),
-               a_info, b_info, output_info, min, max, expected)
-{
-    // Lock tensors
-    Status status =  NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(&a_info.clone()->set_is_resizable(false),
-                                                                                  &b_info.clone()->set_is_resizable(false),
-                                                                                  &output_info.clone()->set_is_resizable(false),
-                                                                                  min,
-                                                                                  max);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                                                                   quantize_down_int32_to_int8_scale_by_fixedpoint_cases),
-               shape, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max, add_bias)
-{
-    TensorShape shape_bias(shape[0]);
-
-    // Create tensors
-    Tensor in   = create_tensor<Tensor>(shape, DataType::S32);
-    Tensor bias = create_tensor<Tensor>(shape_bias, DataType::S32);
-    Tensor out  = create_tensor<Tensor>(shape, DataType::QASYMM8_SIGNED);
-
-    ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint output_stage;
-    output_stage.configure(&in, add_bias ? &bias : nullptr, &out, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
-
-    // Validate valid region input and output
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(in.info()->valid_region(), valid_region);
-    validate(out.info()->valid_region(), valid_region);
-
-    // Validate valid region bias
-    if(add_bias)
-    {
-        const ValidRegion valid_region_bias = shape_to_valid_region(shape_bias);
-        validate(bias.info()->valid_region(), valid_region_bias);
-    }
-
-    // Validate padding
-    const PaddingSize padding(0);
-    validate(in.info()->padding(), padding);
-    validate(out.info()->padding(), padding);
-
-    if(add_bias)
-    {
-        validate(bias.info()->padding(), padding);
-    }
-}
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int8_scale_by_fixedpoint_cases))
+TEST_SUITE_END() // DynamicQuantization
+
+#ifdef __aarch64__
+// Dequant tests involve returning F32 from the MatrixMultiplyCore kernels, which is only implemented on aarch64
+TEST_SUITE(Dequant)
+constexpr AbsoluteTolerance<float> tolerance_dequantized(0.01f);
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::ALL,
+    combine(
+        datasets::SmallGEMMLowpDataset(),
+        make("accumulate", {true, false})
+    ))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
+    validate(Accessor(_target), _reference, tolerance_dequantized);
 }
 
-TEST_SUITE(BoundedReLu)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::NIGHTLY,
+    combine(
+        datasets::LargeGEMMLowpDataset(),
+        make("accumulate", {false})
+    ))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // BoundedReLu
-TEST_SUITE_END() // QuantizeDownInt32ToInt8ScaleByFixedPoint
-
-TEST_SUITE(QuantizeDownInt32ToInt16ScaleByFixedPoint)
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                    2)
-                                                                    * framework::dataset::make("min", -32768) * framework::dataset::make("max", 32767) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
-                                                                         2)
-                                                                         * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases = framework::dataset::make("result_fixedpoint_multiplier", 1073741823,
-                                                                                                        1073741825)
-                                                                               * framework::dataset::make("result_shift", -3,
-                                                                                                          -2)
-                                                                               * framework::dataset::make("min", -32768) * framework::dataset::make("max", 32767) * framework::dataset::make("addBias", { false, true });
-
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600,
-                                                                                                             254601602)
-                                                                                    * framework::dataset::make("result_shift", -3,
-                                                                                                               -1)
-                                                                                    * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
-
-using NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture =
-    GEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint>;
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Input not a multiple of 16
-                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::S32), // Wrong output data type
-                                          }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(20U), 1, DataType::S32),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QSYMM16),
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                           })),
-    framework::dataset::make("Min",{        -205,
-                                            -180,
-                                           })),
-    framework::dataset::make("Max",{        205,
-                                            180,
-                                           })),
-    framework::dataset::make("Expected", { true, false })),
-    a_info, b_info, output_info, min, max, expected)
-{
-    // Lock tensors
-    Status status =  NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&a_info.clone()->set_is_resizable(false),
-                                                                                 &b_info.clone()->set_is_resizable(false),
-                                                                                 &output_info.clone()->set_is_resizable(false),
-                                                                                 min,
-                                                                                 max);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+    validate(Accessor(_target), _reference, tolerance_dequantized);
 }
-// clang-format on
-// *INDENT-ON*
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                                                                   quantize_down_int32_to_int16_scale_by_fixedpoint_cases),
-               shape, result_fixedpoint_multiplier, result_shift, min, max, add_bias)
-{
-    TensorShape shape_bias(shape[0]);
-
-    // Create tensors
-    Tensor in   = create_tensor<Tensor>(shape, DataType::S32);
-    Tensor bias = create_tensor<Tensor>(shape_bias, DataType::S32);
-    Tensor out  = create_tensor<Tensor>(shape, DataType::QSYMM16);
-
-    ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS);
+TEST_SUITE_END() // Dequant
+#endif // __aarch64__
 
-    // Create and configure function
-    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint output_stage;
-    output_stage.configure(&in, add_bias ? &bias : nullptr, &out, result_fixedpoint_multiplier, result_shift, min, max);
-
-    // Validate valid region input and output
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(in.info()->valid_region(), valid_region);
-    validate(out.info()->valid_region(), valid_region);
-
-    // Validate valid region bias
-    if(add_bias)
-    {
-        const ValidRegion valid_region_bias = shape_to_valid_region(shape_bias);
-        validate(bias.info()->valid_region(), valid_region_bias);
-    }
-
-    // Validate padding
-    const PaddingSize padding(0);
-    validate(in.info()->padding(), padding);
-    validate(out.info()->padding(), padding);
-
-    if(add_bias)
-    {
-        validate(bias.info()->padding(), padding);
-    }
-}
-TEST_SUITE(NoRelu)
-TEST_SUITE(MultSmallerEq1)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_cases))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // MultSmallerEq1
-TEST_SUITE(MultGreater1)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // MultGreater1
-TEST_SUITE_END() // NoRelu
-TEST_SUITE(BoundedReLu)
-TEST_SUITE(MultSmallerEq1)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_relu_cases))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // MultSmallerEq1
-TEST_SUITE(MultGreater1)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
-                       quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // MultGreater1
-TEST_SUITE_END() // BoundedReLu
-TEST_SUITE_END() // QuantizeDownInt32ToInt16ScaleByFixedPoint
-TEST_SUITE_END() // OutputStage
+TEST_SUITE_END() // MatrixMultiplyCore
 TEST_SUITE_END() // GEMMLowp
 TEST_SUITE_END() // NEON
 } // namespace validation
diff --git a/tests/validation/NEON/Gather.cpp b/tests/validation/NEON/Gather.cpp
index 6c7faa0032..0aea19939e 100644
--- a/tests/validation/NEON/Gather.cpp
+++ b/tests/validation/NEON/Gather.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -97,35 +97,17 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-DATA_TEST_CASE(Configuration,
-               framework::DatasetMode::ALL,
-               combine(arm_compute::test::datasets::SmallGatherDataset(), framework::dataset::make("DataType", { DataType::F32 })),
-               input_shape, indices_shape, axis, data_type)
-{
-    const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input_shape.num_dimensions()));
-    Tensor         src         = create_tensor<Tensor>(input_shape, data_type);
-    Tensor         indices     = create_tensor<Tensor>(indices_shape, DataType::U32);
-    TensorShape    dst_shape   = arm_compute::misc::shape_calculator::compute_gather_shape(input_shape, indices_shape, actual_axis);
-    Tensor         dst         = create_tensor<Tensor>(dst_shape, data_type);
-
-    // Create and Configure function
-    NEGather gather;
-    gather.configure(&src, &indices, &dst, axis);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(dst.info()->tensor_shape());
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 template <typename T>
 using NEGatherFixture = GatherFixture<Tensor, Accessor, NEGather, T>;
 
+const auto gather_small_shapes = arm_compute::test::framework::dataset::concat(datasets::SmallGatherDataset(), datasets::SmallGatherMultiDimIndicesDataset());
+
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEGatherFixture<half>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(datasets::SmallGatherDataset(), framework::dataset::make("DataType", DataType::F16)))
+                       combine(gather_small_shapes, framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -145,7 +127,7 @@ TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEGatherFixture<float>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(datasets::SmallGatherDataset(), framework::dataset::make("DataType", DataType::F32)))
+                       combine(gather_small_shapes, framework::dataset::make("DataType", DataType::F32)))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -166,7 +148,7 @@ TEST_SUITE(U8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEGatherFixture<uint8_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(datasets::SmallGatherDataset(), framework::dataset::make("DataType", DataType::U8)))
+                       combine(gather_small_shapes, framework::dataset::make("DataType", DataType::U8)))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -186,7 +168,7 @@ TEST_SUITE(U16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEGatherFixture<uint16_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(datasets::SmallGatherDataset(), framework::dataset::make("DataType", DataType::U16)))
+                       combine(gather_small_shapes, framework::dataset::make("DataType", DataType::U16)))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -203,7 +185,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
 TEST_SUITE_END() // U16
 
 TEST_SUITE_END() // Gather
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Gaussian3x3.cpp b/tests/validation/NEON/Gaussian3x3.cpp
deleted file mode 100644
index 369e768c33..0000000000
--- a/tests/validation/NEON/Gaussian3x3.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/Gaussian3x3Fixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /** Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /** Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Gaussian3x3)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
-                                                                   datasets::BorderModes()),
-               shape, data_type, border_mode)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEGaussian3x3 gaussian3x3;
-    gaussian3x3.configure(&src, &dst, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size);
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(1);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-1);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
-template <typename T>
-using NEGaussian3x3Fixture = Gaussian3x3ValidationFixture<Tensor, Accessor, NEGaussian3x3, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGaussian3x3Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                   DataType::U8)),
-                                                                                                           datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGaussian3x3Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::U8)),
-                                                                                                         datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Gaussian5x5.cpp b/tests/validation/NEON/Gaussian5x5.cpp
deleted file mode 100644
index b748d73e88..0000000000
--- a/tests/validation/NEON/Gaussian5x5.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/Gaussian5x5Fixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 5;              /** Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /** Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Gaussian5x5)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
-                                                                   datasets::BorderModes()),
-               shape, data_type, border_mode)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEGaussian5x5 gaussian5x5;
-    gaussian5x5.configure(&src, &dst, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size);
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 16);
-    calculator.set_border_size(2);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_processed_elements(8);
-    calculator.set_access_offset(-2);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
-template <typename T>
-using NEGaussian5x5Fixture = Gaussian5x5ValidationFixture<Tensor, Accessor, NEGaussian5x5, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGaussian5x5Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                   DataType::U8)),
-                                                                                                           datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGaussian5x5Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::U8)),
-                                                                                                         datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/GaussianPyramid.cpp b/tests/validation/NEON/GaussianPyramid.cpp
deleted file mode 100644
index c646b5011f..0000000000
--- a/tests/validation/NEON/GaussianPyramid.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
-*/
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/GaussianPyramidHalfFixture.h"
-#include "tests/validation/reference/Utils.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-const auto small_gaussian_pyramid_levels = combine(datasets::Medium2DShapes(), datasets::BorderModes()) * framework::dataset::make("numlevels", 2, 4);
-const auto large_gaussian_pyramid_levels = combine(datasets::Large2DShapes(), datasets::BorderModes()) * framework::dataset::make("numlevels", 2, 5);
-
-template <typename T>
-inline void validate_gaussian_pyramid(const Pyramid &target, const std::vector<SimpleTensor<T>> &reference, BorderMode border_mode)
-{
-    ValidRegion prev_valid_region = shape_to_valid_region(reference[0].shape());
-
-    for(size_t i = 1; i < reference.size(); ++i)
-    {
-        const ValidRegion valid_region = shape_to_valid_region_gaussian_pyramid_half(reference[i - 1].shape(), prev_valid_region, (border_mode == BorderMode::UNDEFINED));
-
-        // Validate outputs
-        validate(Accessor(*(target.get_pyramid_level(i))), reference[i], valid_region);
-
-        // Keep the valid region for the next level
-        prev_valid_region = valid_region;
-    }
-}
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(GaussianPyramid)
-TEST_SUITE(Half)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, large_gaussian_pyramid_levels,
-               shape, border_mode, num_levels)
-{
-    Tensor src = create_tensor<Tensor>(shape, DataType::U8);
-
-    // Create pyramid
-    PyramidInfo pyramid_info(num_levels, SCALE_PYRAMID_HALF, shape, Format::U8);
-    Pyramid     dst;
-    dst.init(pyramid_info);
-
-    NEGaussianPyramidHalf gaussian_pyramid_half;
-    gaussian_pyramid_half.configure(&src, &dst, border_mode, 0);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    for(size_t level = 0; level < pyramid_info.num_levels(); ++level)
-    {
-        ARM_COMPUTE_EXPECT(dst.get_pyramid_level(level)->info()->is_resizable(), framework::LogLevel::ERRORS);
-    }
-}
-
-template <typename T>
-using NEGaussianPyramidHalfFixture = GaussianPyramidHalfValidationFixture<Tensor, Accessor, NEGaussianPyramidHalf, T, Pyramid>;
-
-FIXTURE_DATA_TEST_CASE(RunSmallGaussianPyramidHalf, NEGaussianPyramidHalfFixture<uint8_t>, framework::DatasetMode::ALL, small_gaussian_pyramid_levels)
-{
-    validate_gaussian_pyramid(_target, _reference, _border_mode);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLargeGaussianPyramidHalf, NEGaussianPyramidHalfFixture<uint8_t>, framework::DatasetMode::NIGHTLY, large_gaussian_pyramid_levels)
-{
-    validate_gaussian_pyramid(_target, _reference, _border_mode);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/GenerateProposalsLayer.cpp b/tests/validation/NEON/GenerateProposalsLayer.cpp
index dd9c1eff0a..960c2054e7 100644
--- a/tests/validation/NEON/GenerateProposalsLayer.cpp
+++ b/tests/validation/NEON/GenerateProposalsLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,13 +22,14 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h"
 #include "arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPermute.h"
 #include "arm_compute/runtime/NEON/functions/NESlice.h"
+#include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
 #include "tests/Globals.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/NEON/ArrayAccessor.h"
+#include "tests/NEON/Helper.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
@@ -43,6 +44,8 @@ namespace validation
 {
 namespace
 {
+using NEComputeAllAnchors = NESynthetizeFunction<NEComputeAllAnchorsKernel>;
+
 template <typename U, typename T>
 inline void fill_tensor(U &&tensor, const std::vector<T> &v)
 {
@@ -412,7 +415,7 @@ TEST_SUITE_END() // QASYMM8
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // GenerateProposals
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/GlobalPoolingLayer.cpp b/tests/validation/NEON/GlobalPoolingLayer.cpp
index 769780647b..9cf6a8bb32 100644
--- a/tests/validation/NEON/GlobalPoolingLayer.cpp
+++ b/tests/validation/NEON/GlobalPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/HOGDescriptor.cpp b/tests/validation/NEON/HOGDescriptor.cpp
deleted file mode 100644
index 29663676c1..0000000000
--- a/tests/validation/NEON/HOGDescriptor.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/HOG.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/HOGDescriptorDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/HOGDescriptorFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-RelativeTolerance<float> tolerance(0.1f);
-constexpr float          tolerance_number = 0.05f;
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(HOGDescriptor)
-
-// *INDENT-OFF*
-// clang-format off
-using NEHOGDescriptorFixture = HOGDescriptorValidationFixture<Tensor, HOG, Accessor, NEHOGDescriptor, uint8_t, float>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEHOGDescriptorFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(
-                       datasets::SmallHOGDescriptorDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance, tolerance_number);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEHOGDescriptorFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(
-                       datasets::LargeHOGDescriptorDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance, tolerance_number);
-}
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/HOGDetector.cpp b/tests/validation/NEON/HOGDetector.cpp
deleted file mode 100644
index c787728d2c..0000000000
--- a/tests/validation/NEON/HOGDetector.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/Array.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/ArrayAccessor.h"
-#include "tests/NEON/HOGAccessor.h"
-#include "tests/datasets/HOGDescriptorDataset.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/HOGDetectorFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/* Set the tolerance (percentage) used when validating the score of detection window.
-   Note: High tolerance is required due to divergence between CL and NEON detection window scores. */
-RelativeTolerance<float> tolerance(1.0f);
-
-/* Input dataset (values must be a multiple of the HOGInfo block_size) */
-const auto DetectionWindowStrideDataset = framework::dataset::make("DetectionWindowStride", { Size2D(8, 8), Size2D(16, 16) });
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(HOGDetector)
-
-// *INDENT-OFF*
-// clang-format off
-using NEHOGDetectorFixture = HOGDetectorValidationFixture<Tensor,
-                                                          HOG,
-                                                          DetectionWindowArray,
-                                                          NEHOGDescriptor,
-                                                          Accessor,
-                                                          ArrayAccessor<DetectionWindow>,
-                                                          HOGAccessor,
-                                                          NEHOGDetector,
-                                                          uint8_t,
-                                                          float>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEHOGDetectorFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(
-                       DetectionWindowStrideDataset,
-                       datasets::SmallHOGDescriptorDataset()),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
-{
-    // Validate output
-    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEHOGDetectorFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       DetectionWindowStrideDataset,
-                       datasets::LargeHOGDescriptorDataset()),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
-{
-    // Validate output
-    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
-}
-
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/HOGMultiDetection.cpp b/tests/validation/NEON/HOGMultiDetection.cpp
deleted file mode 100644
index d6017e000c..0000000000
--- a/tests/validation/NEON/HOGMultiDetection.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/MultiHOG.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/ArrayAccessor.h"
-#include "tests/NEON/HOGAccessor.h"
-#include "tests/datasets/HOGMultiDetectionDataset.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/HOGMultiDetectionFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/* Set the tolerance (percentage) used when validating the strength of detection window. */
-RelativeTolerance<float> tolerance(1.0f);
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(HOGMultiDetection)
-
-// *INDENT-OFF*
-// clang-format off
-using NEHOGMultiDetectionFixture = HOGMultiDetectionValidationFixture<Tensor,
-                                                                      HOG,
-                                                                      MultiHOG,
-                                                                      DetectionWindowArray,
-                                                                      Size2DArray,
-                                                                      Accessor,
-                                                                      ArrayAccessor<Size2D>,
-                                                                      ArrayAccessor<DetectionWindow>,
-                                                                      HOGAccessor,
-                                                                      NEHOGMultiDetection,
-                                                                      uint8_t,
-                                                                      float>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEHOGMultiDetectionFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(
-                       datasets::SmallHOGMultiDetectionDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})),
-                       framework::dataset::make("NonMaximaSuppression", {false, true})))
-{
-    // Validate output
-    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEHOGMultiDetectionFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       datasets::LargeHOGMultiDetectionDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})),
-                       framework::dataset::make("NonMaximaSuppression", {false, true})))
-{
-    // Validate output
-    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
-}
-
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/HarrisCorners.cpp b/tests/validation/NEON/HarrisCorners.cpp
deleted file mode 100644
index 23621cde4d..0000000000
--- a/tests/validation/NEON/HarrisCorners.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/ArrayAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ImageFileDatasets.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/HarrisCornersFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/* Allowed percentage of keypoints missing for target */
-const float allowed_missing_percentage = 10.f;
-/* Allowed percentage of keypoints mismatching between target and reference */
-const float allowed_mismatch_percentage = 10.f;
-
-const auto data = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }), combine(framework::dataset::make("BlockSize", { 3, 5, 7 }), datasets::BorderModes()));
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(HarrisCorners)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), data), framework::dataset::make("Format", Format::U8)), shape,
-               gradient_size, block_size, border_mode, format)
-{
-    std::mt19937                          gen(library->seed());
-    std::uniform_real_distribution<float> real_dist(0.f, 0.01f);
-
-    const float threshold   = real_dist(gen);
-    const float sensitivity = real_dist(gen);
-
-    constexpr float max_euclidean_distance = 30.f;
-    real_dist                              = std::uniform_real_distribution<float>(0.f, max_euclidean_distance);
-    const float min_dist                   = real_dist(gen);
-
-    // Generate a random constant value
-    std::uniform_int_distribution<uint8_t> int_dist(0, 255);
-    const uint8_t                          constant_border_value = int_dist(gen);
-
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type_from_format(format));
-    src.info()->set_format(format);
-    KeyPointArray corners;
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create harris corners configure function
-    NEHarrisCorners harris_corners;
-    harris_corners.configure(&src, threshold, min_dist, sensitivity, gradient_size, block_size, &corners, border_mode, constant_border_value);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-
-    calculator.set_border_mode(border_mode);
-    calculator.set_border_size(gradient_size / 2);
-    calculator.set_access_offset(-gradient_size / 2);
-    calculator.set_accessed_elements(16);
-
-    const PaddingSize padding = calculator.required_padding();
-
-    validate(src.info()->padding(), padding);
-}
-
-template <typename T>
-using NEHarrisCornersFixture = HarrisCornersValidationFixture<Tensor, Accessor, KeyPointArray, NEHarrisCorners, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEHarrisCornersFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallImageFiles(), data), framework::dataset::make("Format",
-                                                                                                             Format::U8)))
-{
-    // Validate output
-    ArrayAccessor<KeyPoint> array(_target);
-    validate_keypoints(array.buffer(),
-                       array.buffer() + array.num_values(),
-                       _reference.begin(),
-                       _reference.end(),
-                       RelativeTolerance<float>(0.0001f),
-                       allowed_missing_percentage,
-                       allowed_mismatch_percentage);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEHarrisCornersFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeImageFiles(), data), framework::dataset::make("Format", Format::U8)))
-{
-    // Validate output
-    ArrayAccessor<KeyPoint> array(_target);
-    validate_keypoints(array.buffer(),
-                       array.buffer() + array.num_values(),
-                       _reference.begin(),
-                       _reference.end(),
-                       RelativeTolerance<float>(0.0001f),
-                       allowed_missing_percentage,
-                       allowed_mismatch_percentage);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/HeightConcatenateLayer.cpp b/tests/validation/NEON/HeightConcatenateLayer.cpp
index bfb0a21ab9..c46b797315 100644
--- a/tests/validation/NEON/HeightConcatenateLayer.cpp
+++ b/tests/validation/NEON/HeightConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,7 +71,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
     inputs_vector_info.emplace_back(std::move(input_info1));
     inputs_vector_info.emplace_back(std::move(input_info2));
 
-    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    std::vector<const ITensorInfo *> inputs_vector_info_raw;
     inputs_vector_info_raw.reserve(inputs_vector_info.size());
     for(auto &input : inputs_vector_info)
     {
diff --git a/tests/validation/NEON/Histogram.cpp b/tests/validation/NEON/Histogram.cpp
deleted file mode 100644
index f0beb4ed00..0000000000
--- a/tests/validation/NEON/Histogram.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/Distribution1D.h"
-#include "arm_compute/runtime/NEON/functions/NEHistogram.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/HistogramFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(Histogram)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(),
-                                                                   framework::dataset::make("DataType", DataType::U8)),
-               shape, data_type)
-{
-    // Setup Distribution
-    std::mt19937                            gen(library->seed());
-    std::uniform_int_distribution<size_t>   distribution_size_t(1, 30);
-    const size_t                            num_bins = distribution_size_t(gen);
-    std::uniform_int_distribution<int32_t>  distribution_int32_t(0, 125);
-    const size_t                            offset = distribution_int32_t(gen);
-    std::uniform_int_distribution<uint32_t> distribution_uint32_t(1, 255 - offset);
-    const size_t                            range = distribution_uint32_t(gen);
-    Distribution1D                          distribution_dst(num_bins, offset, range);
-
-    // Create tensors
-    Tensor      src = create_tensor<Tensor>(shape, data_type);
-    TensorShape dst_shape(num_bins);
-    Tensor      dst = create_tensor<Tensor>(dst_shape, DataType::U32);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEHistogram histogram;
-    histogram.configure(&src, &distribution_dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    const ValidRegion valid_region_dst = shape_to_valid_region(dst_shape);
-    validate(dst.info()->valid_region(), valid_region_dst);
-
-    // Validate padding
-    const PaddingSize padding;
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
-template <typename T>
-using NEHistogramFixture = HistogramValidationFixture<Tensor, Accessor, NEHistogram, T, Distribution1D>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEHistogramFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                         DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEHistogramFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                       DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Im2Col.cpp b/tests/validation/NEON/Im2Col.cpp
index f4b2cc7835..ef5e75c5db 100644
--- a/tests/validation/NEON/Im2Col.cpp
+++ b/tests/validation/NEON/Im2Col.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,8 +22,9 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEIm2Col.h"
+#include "src/cpu/kernels/CpuIm2ColKernel.h"
 #include "tests/NEON/Accessor.h"
+#include "tests/NEON/Helper.h"
 #include "tests/datasets/ShapeDatasets.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
@@ -39,6 +40,8 @@ namespace validation
 {
 namespace
 {
+const auto im2col_shapes = framework::dataset::make("Shape", { TensorShape{ 11U, 11U, 11U }, TensorShape{ 16U, 16U, 16U }, TensorShape{ 27U, 13U, 7U }, TensorShape{ 31U, 27U, 17U, 2U }, TensorShape{ 27U, 13U, 5U, 4U }, TensorShape{ 11U, 11U, 5U, 5U } });
+
 const auto conv_filter_sizes = framework::dataset::make("KernelDims", { Size2D(3U, 3U), Size2D(3U, 1U), Size2D(1U, 5U), Size2D(5U, 5U), Size2D(7U, 7U) });
 const auto conv_args         = combine(combine(combine(combine(conv_filter_sizes, framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(2U, 2U, 0U, 2U) })),
                                                        framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
@@ -54,6 +57,8 @@ const auto conv_args_small         = combine(combine(combine(combine(conv_filter
 TEST_SUITE(NEON)
 TEST_SUITE(Im2Col)
 
+using CpuIm2Col = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuIm2ColKernel>;
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
@@ -73,26 +78,26 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                framework::dataset::make("Expected", { false, false, false, false, true })),
                input_info, output_info, has_bias, expected)
 {
-    bool status = bool(NEIm2Col::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias));
+    bool status = bool(cpu::kernels::CpuIm2ColKernel::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias));
     ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
 // *INDENT-ON*
 
 template <typename T>
-using NEIm2ColFixture = Im2ColValidationFixture<Tensor, Accessor, NEIm2Col, T, false>;
+using CpuIm2ColFixture = Im2ColOpValidationFixture<Tensor, Accessor, CpuIm2Col, T, false>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)),
-                                                                                                    conv_args_small))
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuIm2ColFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::F32)),
+                                                                                                     conv_args_small))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType",
-                                                                                                          DataType::F32)),
-                                                                                                  conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, CpuIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()), framework::dataset::make("DataType",
+                                                                                                           DataType::F32)),
+                                                                                                   conv_args))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -102,15 +107,15 @@ TEST_SUITE_END() // FP32
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
-                                                                                                   conv_args_small))
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuIm2ColFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::F16)),
+                                                                                                    conv_args_small))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType",
-                                                                                                         DataType::F16)),
-                                                                                                 conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, CpuIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()), framework::dataset::make("DataType",
+                                                                                                          DataType::F16)),
+                                                                                                  conv_args))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -122,15 +127,15 @@ TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                      conv_args_small))
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuIm2ColFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                       conv_args_small))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                                                                                            framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                    conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, CpuIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()),
+                                                                                                             framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                     conv_args))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -160,8 +165,8 @@ TEST_CASE(PaddedChannelNHWC, framework::DatasetMode::PRECOMMIT)
     Tensor dst_target = create_tensor<Tensor>(dst_shape, data_type, 1, qinfo);
 
     // Configure target function
-    NEIm2Col im2col_func;
-    im2col_func.configure(&src_target, &dst_target, spatial_kernel, conv_info, has_bias);
+    CpuIm2Col im2col_func;
+    im2col_func.configure(src_target.info(), dst_target.info(), spatial_kernel, conv_info, has_bias);
 
     // Extend padding
     src_target.info()->extend_padding(PaddingSize(3, 5, 9, 1));
@@ -180,8 +185,13 @@ TEST_CASE(PaddedChannelNHWC, framework::DatasetMode::PRECOMMIT)
     // Fill target source
     library->fill_tensor_uniform(Accessor(src_target), 0);
 
+    ITensorPack pack =
+    {
+        { TensorType::ACL_SRC, &src_target },
+        { TensorType::ACL_DST, &dst_target }
+    };
     // Run target function
-    im2col_func.run();
+    im2col_func.run(pack);
 
     // Calculate Reference
     SimpleTensor<float> src_ref{ src_shape, data_type, 1, qinfo, data_layout };
@@ -200,7 +210,7 @@ TEST_CASE(PaddedChannelNHWC, framework::DatasetMode::PRECOMMIT)
 }
 TEST_SUITE_END() // Special Cases
 TEST_SUITE_END() // Im2Col
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/InstanceNormalizationLayer.cpp b/tests/validation/NEON/InstanceNormalizationLayer.cpp
index d2a80f24f7..593ef0ad99 100644
--- a/tests/validation/NEON/InstanceNormalizationLayer.cpp
+++ b/tests/validation/NEON/InstanceNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,11 @@ namespace
 /** Tolerance for float operations */
 AbsoluteTolerance<float> tolerance_f32(0.0015f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-AbsoluteTolerance<float> tolerance_f16(0.5f);
+// This tolerance is chosen based on the precision float16_t can provide
+// for decimal numbers between 16 and 32, and was settled on after running
+// the tests multiple times. However, with randomly generated numbers
+// there is no guarantee that this tolerance will always be large enough.
+AbsoluteTolerance<half> tolerance_f16(static_cast<half>(0.015625f));
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 } // namespace
 
@@ -119,7 +123,7 @@ TEST_SUITE_END() // FP16
 #endif           // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 TEST_SUITE_END() // InstanceNormalizationLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/IntegralImage.cpp b/tests/validation/NEON/IntegralImage.cpp
deleted file mode 100644
index 4408acdcc4..0000000000
--- a/tests/validation/NEON/IntegralImage.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/IntegralImageFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(IntegralImage)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, DataType::U32);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEIntegralImage integral_image;
-    integral_image.configure(&src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize src_padding = PaddingCalculator(shape.x(), 16).required_padding();
-    const PaddingSize dst_padding(1, src_padding.right, 0, 1);
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
-template <typename T>
-using NEIntegralImageFixture = IntegralImageValidationFixture<Tensor, Accessor, NEIntegralImage, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIntegralImageFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                             DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIntegralImageFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                           DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/L2NormalizeLayer.cpp b/tests/validation/NEON/L2NormalizeLayer.cpp
index 17147c1d50..56223c44d5 100644
--- a/tests/validation/NEON/L2NormalizeLayer.cpp
+++ b/tests/validation/NEON/L2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -99,8 +99,8 @@ using NEL2NormalizeLayerFixture = L2NormalizeLayerValidationFixture<Tensor, Acce
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEL2NormalizeLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                       framework::dataset::make("Axis", { -1, 0, 2 })),
-                               framework::dataset::make("Epsilon", { 1e-12 })))
+                                       framework::dataset::make("Axis", { -1, 0, 1, 2 })),
+                               framework::dataset::make("Epsilon", { 1e-6 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -109,7 +109,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEL2NormalizeLayerFixture<float>, framework::Da
 FIXTURE_DATA_TEST_CASE(RunLarge, NEL2NormalizeLayerFixture<float>, framework::DatasetMode::NIGHTLY,
                        combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                        framework::dataset::make("Axis", { -1, 0, 2 })),
-                               framework::dataset::make("Epsilon", { 1e-12 })))
+                               framework::dataset::make("Epsilon", { 1e-6 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -120,8 +120,8 @@ TEST_SUITE_END() // FP32
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEL2NormalizeLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                       framework::dataset::make("Axis", { -1, 0, 2 })),
-                               framework::dataset::make("Epsilon", { 1e-12 })))
+                                       framework::dataset::make("Axis", { -1, 0, 1, 2 })),
+                               framework::dataset::make("Epsilon", { 1e-6 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -130,7 +130,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEL2NormalizeLayerFixture<half>, framework::Dat
 FIXTURE_DATA_TEST_CASE(RunLarge, NEL2NormalizeLayerFixture<half>, framework::DatasetMode::NIGHTLY,
                        combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                        framework::dataset::make("Axis", { -1, 0, 2 })),
-                               framework::dataset::make("Epsilon", { 1e-12 })))
+                               framework::dataset::make("Epsilon", { 1e-6 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -139,7 +139,7 @@ TEST_SUITE_END() // FP16
 #endif           // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 TEST_SUITE_END() // L2NormalizeLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/LSTMLayer.cpp b/tests/validation/NEON/LSTMLayer.cpp
index 45beb36e60..c4645563bf 100644
--- a/tests/validation/NEON/LSTMLayer.cpp
+++ b/tests/validation/NEON/LSTMLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -134,9 +134,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                input_info, input_weights_info, recurrent_weights_info, cell_bias_info, projection_bias_info, cell_state_info, output_info, scratch_info, info, expected)
 {
     LSTMParams<ITensorInfo> lstm_params_info;
-    lstm_params_info.set_peephole_params(&cell_bias_info, &cell_bias_info)
+    auto cell_bias_clone = cell_bias_info.clone();
+    lstm_params_info.set_peephole_params(cell_bias_clone.get(), cell_bias_clone.get())
                     .set_projection_params(&recurrent_weights_info, &projection_bias_info)
-                    .set_cifg_params(&input_weights_info, &recurrent_weights_info, &cell_bias_info, &cell_bias_info);
+                    .set_cifg_params(&input_weights_info, &recurrent_weights_info, cell_bias_clone.get(), cell_bias_clone.get());
 
     ARM_COMPUTE_EXPECT(bool(NELSTMLayer::validate(&input_info.clone()->set_is_resizable(false), &input_weights_info.clone()->set_is_resizable(false), &input_weights_info.clone()->set_is_resizable(false),
                                                   &input_weights_info.clone()->set_is_resizable(false), &recurrent_weights_info.clone()->set_is_resizable(false), &recurrent_weights_info.clone()->set_is_resizable(false),
@@ -180,7 +181,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NELSTMLayerFixture<half>, framework::DatasetMod
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 TEST_SUITE_END() // LSTMLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/LSTMLayerQuantized.cpp b/tests/validation/NEON/LSTMLayerQuantized.cpp
index b57a8f7d26..6b98ee2b67 100644
--- a/tests/validation/NEON/LSTMLayerQuantized.cpp
+++ b/tests/validation/NEON/LSTMLayerQuantized.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -64,11 +64,7 @@ inline void fill_tensor(SimpleTensor<T> &tensor, const std::vector<T> &v)
 }
 
 /** Tolerance for quantized asymmetric operations */
-#if defined(__aarch64__)
-constexpr AbsoluteTolerance<int16_t> tolerance_qsymm16(0);
-#else  // defined(__aarch64__)
 constexpr AbsoluteTolerance<int16_t> tolerance_qsymm16(1);
-#endif // defined(__aarch64__)
 
 } // namespace
 
@@ -602,7 +598,7 @@ TEST_SUITE_END() // IntegrationTestCase
 // *INDENT-ON*
 
 TEST_SUITE_END() // LSTMLayerQuantized
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/LaplacianPyramid.cpp b/tests/validation/NEON/LaplacianPyramid.cpp
deleted file mode 100644
index 456ae75494..0000000000
--- a/tests/validation/NEON/LaplacianPyramid.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
-*/
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/LaplacianPyramidFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-const auto small_laplacian_pyramid_levels = framework::dataset::make("NumLevels", 2, 3);
-const auto large_laplacian_pyramid_levels = framework::dataset::make("NumLevels", 2, 5);
-
-const auto formats = combine(framework::dataset::make("FormatIn", Format::U8), framework::dataset::make("FormatOut", Format::S16));
-
-template <typename T>
-inline void validate_laplacian_pyramid(const Pyramid &target, const std::vector<SimpleTensor<T>> &reference, BorderMode border_mode)
-{
-    Tensor     *level_image  = target.get_pyramid_level(0);
-    ValidRegion valid_region = shape_to_valid_region(reference[0].shape(), border_mode == BorderMode::UNDEFINED, BorderSize(2));
-
-    // Validate lowest level
-    validate(Accessor(*level_image), reference[0], valid_region);
-
-    // Validate remaining levels
-    for(size_t lev = 1; lev < target.info()->num_levels(); lev++)
-    {
-        level_image              = target.get_pyramid_level(lev);
-        Tensor *prev_level_image = target.get_pyramid_level(lev - 1);
-
-        valid_region = shape_to_valid_region_laplacian_pyramid(prev_level_image->info()->tensor_shape(),
-                                                               prev_level_image->info()->valid_region(),
-                                                               border_mode == BorderMode::UNDEFINED);
-
-        // Validate level
-        validate(Accessor(*level_image), reference[lev], valid_region);
-    }
-}
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(LaplacianPyramid)
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(
-                                                           concat(datasets::Medium2DShapes(), datasets::Large2DShapes()),
-                                                           datasets::BorderModes()),
-                                                           large_laplacian_pyramid_levels),
-                                                           shape, border_mode, num_levels)
-{
-    // Create pyramid info
-    PyramidInfo pyramid_info(num_levels, SCALE_PYRAMID_HALF, shape, Format::S16);
-    Pyramid     dst_pyramid{};
-    dst_pyramid.init(pyramid_info);
-
-    // Create Tensors
-    Tensor src = create_tensor<Tensor>(shape, Format::U8);
-
-    // The first two dimensions of the output tensor must match the first two
-    // dimensions of the tensor in the last level of the pyramid
-    TensorShape dst_shape(shape);
-    dst_shape.set(0, dst_pyramid.get_pyramid_level(num_levels - 1)->info()->dimension(0));
-    dst_shape.set(1, dst_pyramid.get_pyramid_level(num_levels - 1)->info()->dimension(1));
-    Tensor dst = create_tensor<Tensor>(dst_shape, Format::S16);
-
-    // Create and configure function
-    NELaplacianPyramid laplacian_pyramid;
-    laplacian_pyramid.configure(&src, &dst_pyramid, &dst, border_mode, 0);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    for(size_t level = 0; level < pyramid_info.num_levels(); ++level)
-    {
-        ARM_COMPUTE_EXPECT(dst_pyramid.get_pyramid_level(level)->info()->is_resizable(), framework::LogLevel::ERRORS);
-    }
-}
-
-using NELaplacianPyramidFixture = LaplacianPyramidValidationFixture<Tensor, Accessor, NELaplacianPyramid, uint8_t, int16_t, Pyramid>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NELaplacianPyramidFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(
-                       datasets::Medium2DShapes(),
-                       datasets::BorderModes()),
-                       small_laplacian_pyramid_levels),
-                       formats))
-{
-    validate_laplacian_pyramid(_target, _reference, _border_mode);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NELaplacianPyramidFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       datasets::Large2DShapes(),
-                       datasets::BorderModes()),
-                       large_laplacian_pyramid_levels),
-                       formats))
-{
-    validate_laplacian_pyramid(_target, _reference, _border_mode);
-}
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/LaplacianReconstruct.cpp b/tests/validation/NEON/LaplacianReconstruct.cpp
deleted file mode 100644
index b41449ea61..0000000000
--- a/tests/validation/NEON/LaplacianReconstruct.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
-*/
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h"
-#include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h"
-#include "arm_compute/runtime/Pyramid.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/LaplacianReconstructFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-const auto small_laplacian_reconstruct_levels = framework::dataset::make("NumLevels", 2, 3);
-const auto large_laplacian_reconstruct_levels = framework::dataset::make("NumLevels", 2, 5);
-
-const auto formats = combine(framework::dataset::make("FormatIn", Format::S16), framework::dataset::make("FormatOut", Format::U8));
-
-template <typename T>
-void validate_laplacian_reconstruct(Tensor &target, const SimpleTensor<T> &reference, BorderMode border_mode, size_t num_levels)
-{
-    const unsigned int filter_size = 5;
-    const unsigned int border_size(filter_size / 2);
-
-    BorderSize border(std::pow(border_size, num_levels));
-
-    // Validate output
-    ValidRegion valid_region = shape_to_valid_region(reference.shape(), border_mode == BorderMode::UNDEFINED, border);
-    validate(Accessor(target), reference, valid_region);
-}
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(LaplacianReconstruct)
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(
-                                                           concat(datasets::Medium2DShapes(), datasets::Large2DShapes()),
-                                                           datasets::BorderModes()),
-                                                           large_laplacian_reconstruct_levels),
-                                                           shape, border_mode, num_levels)
-{
-    // Create pyramid info
-    PyramidInfo pyramid_info(num_levels, SCALE_PYRAMID_HALF, shape, Format::S16);
-    Pyramid   dst_pyramid{};
-    dst_pyramid.init(pyramid_info);
-
-    // Create Tensors
-    Tensor src = create_tensor<Tensor>(shape, DataType::U8);
-
-    // The first two dimensions of the output tensor must match the first two
-    // dimensions of the tensor in the last level of the pyramid
-    TensorShape dst_shape(shape);
-    dst_shape.set(0, dst_pyramid.get_pyramid_level(num_levels - 1)->info()->dimension(0));
-    dst_shape.set(1, dst_pyramid.get_pyramid_level(num_levels - 1)->info()->dimension(1));
-    Tensor dst = create_tensor<Tensor>(dst_shape, DataType::S16);
-
-    // The dimensions of the reconstruct are the same as the src shape
-    Tensor rec_dst = create_tensor<Tensor>(shape, DataType::U8);
-
-    // Create and configure pyramid function
-    NELaplacianPyramid laplacian_pyramid;
-    laplacian_pyramid.configure(&src, &dst_pyramid, &dst, border_mode, 0);
-
-    // Create and configure reconstruct function
-    NELaplacianReconstruct laplacian_reconstruct;
-    laplacian_reconstruct.configure(&dst_pyramid, &dst, &rec_dst, border_mode, 0);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    for(size_t level = 0; level < pyramid_info.num_levels(); ++level)
-    {
-        ARM_COMPUTE_EXPECT(dst_pyramid.get_pyramid_level(level)->info()->is_resizable(), framework::LogLevel::ERRORS);
-    }
-
-    ARM_COMPUTE_EXPECT(rec_dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-}
-
-using NELaplacianReconstructFixture = LaplacianReconstructValidationFixture<Tensor, Accessor, NELaplacianReconstruct, NELaplacianPyramid, int16_t, uint8_t, Pyramid>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NELaplacianReconstructFixture, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(
-                       datasets::Medium2DShapes(),
-                       datasets::BorderModes()),
-                       small_laplacian_reconstruct_levels),
-                       formats))
-{
-    validate_laplacian_reconstruct(_target, _reference, _border_mode, _pyramid_levels);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NELaplacianReconstructFixture, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(
-                       datasets::Large2DShapes(),
-                       datasets::BorderModes()),
-                       large_laplacian_reconstruct_levels),
-                       formats))
-{
-    validate_laplacian_reconstruct(_target, _reference, _border_mode, _pyramid_levels);
-}
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/LocallyConnected.cpp b/tests/validation/NEON/LocallyConnected.cpp
deleted file mode 100644
index d58ca45110..0000000000
--- a/tests/validation/NEON/LocallyConnected.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/LocallyConnectedDataset.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/LocallyConnectedFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr RelativeTolerance<float> tolerance_f32(0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(LocallyConnected)
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-    framework::dataset::make("InputInfo",  { TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching data type input/weights
-                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching data type input/bias
-                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching data type input/output
-                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching shape input/weights
-                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching shape input/bias
-                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching shape input/output
-                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Asymmetric padding
-                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32)
-                                           }),
-    framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F16),
-                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 274U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32)
-                                           })),
-    framework::dataset::make("BiasInfo",   { TensorInfo(TensorShape(21U, 275U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F16),
-                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(21U, 274U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F32)
-                                           })),
-    framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F16),
-                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(11U, 25U, 22U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32)
-                                           })),
-    framework::dataset::make("PadStride",  { PadStrideInfo(2, 1, 0, 0),
-                                             PadStrideInfo(2, 1, 0, 0),
-                                             PadStrideInfo(2, 1, 0, 0),
-                                             PadStrideInfo(2, 1, 0, 0),
-                                             PadStrideInfo(2, 1, 0, 0),
-                                             PadStrideInfo(2, 1, 0, 0),
-                                             PadStrideInfo(2, 1, 1, 0, 0, 0, DimensionRoundingType::FLOOR),
-                                             PadStrideInfo(2, 1, 0, 0)
-                                           })),
-    framework::dataset::make("Expected", { false, false, false, false, false, false, false, true })),
-    input_info, weights_info, bias_info, output_info, conv_info, expected)
-{
-    bool is_valid = bool(NELocallyConnectedLayer::validate(&input_info.clone()->set_is_resizable(false),
-                                                           &weights_info.clone()->set_is_resizable(false),
-                                                           &bias_info.clone()->set_is_resizable(false),
-                                                           &output_info.clone()->set_is_resizable(false),
-                                                           conv_info));
-    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallLocallyConnectedDataset(), datasets::LargeLocallyConnectedDataset()),
-                                                                   framework::dataset::make("DataType", DataType::F32)),
-               src_shape, weights_shape, bias_shape, dst_shape, info, dilation, data_type)
-{
-    ARM_COMPUTE_UNUSED(dilation);
-
-    // Create tensors
-    Tensor src     = create_tensor<Tensor>(src_shape, data_type);
-    Tensor weights = create_tensor<Tensor>(weights_shape, data_type);
-    Tensor bias    = create_tensor<Tensor>(bias_shape, data_type);
-    Tensor dst     = create_tensor<Tensor>(dst_shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function.
-    NELocallyConnectedLayer lc;
-    lc.configure(&src, &weights, &bias, &dst, info);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(dst_shape);
-    validate(dst.info()->valid_region(), dst_valid_region);
-}
-
-template <typename T>
-using NELocallyConnectedFixture = LocallyConnectedValidationFixture<Tensor, Accessor, NELocallyConnectedLayer, T>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NELocallyConnectedFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallLocallyConnectedDataset(),
-                                                                                                              framework::dataset::make("DataType",
-                                                                                                                      DataType::F32)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_f32);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NELocallyConnectedFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeLocallyConnectedDataset(),
-                                                                                                              framework::dataset::make("DataType",
-                                                                                                                      DataType::F32)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_f32);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/LogSoftmaxLayer.cpp b/tests/validation/NEON/LogSoftmaxLayer.cpp
index e35c8fd8a2..a7ab033359 100644
--- a/tests/validation/NEON/LogSoftmaxLayer.cpp
+++ b/tests/validation/NEON/LogSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,23 +71,23 @@ TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NELogSoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
                                                                                                                     framework::dataset::make("DataType", DataType::F16)),
                                                                                                                     framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                            framework::dataset::make("Axis", { 1 })))
+                                                                                                            framework::dataset::make("Axis", { 0, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                      framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                      framework::dataset::make("DataType", DataType::F16)),
                                                                                                                       framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                              framework::dataset::make("Axis", { 1, 2, 3 })))
+                                                                                                              framework::dataset::make("Axis", { 0, -3, 2 })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_f32);
+    validate(Accessor(_target), _reference, tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NELogSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
                                                                                                                   framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                          framework::dataset::make("Axis", { 1 })))
+                                                                                                          framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -99,7 +99,7 @@ TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall2D, NELogSoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
                                                                                                                        framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                               framework::dataset::make("Axis", { 1 })))
+                                                                                                               framework::dataset::make("Axis", { 0, 1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -107,7 +107,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall2D, NELogSoftmaxLayerFixture<float>, framework::D
 FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
                                                                                                                        framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                               framework::dataset::make("Axis", { 1, 2, 3 })))
+                                                                                                               framework::dataset::make("Axis", { 0, 2, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -115,7 +115,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerFixture<float>, framework::D
 FIXTURE_DATA_TEST_CASE(RunLarge, NELogSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
                                                                                                                    framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                           framework::dataset::make("Axis", { 1 })))
+                                                                                                           framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -132,7 +132,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall2D, NELogSoftmaxLayerQuantizedFixture<uint8_t>, f
                                                                                                                     framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                     combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
                                                                                                                             framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                    framework::dataset::make("Axis", { 1 })))
+                                                                                                                    framework::dataset::make("Axis", { 0, 1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -141,7 +141,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerQuantizedFixture<uint8_t>, f
                                                                                                                     framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                     combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
                                                                                                                             framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                    framework::dataset::make("Axis", { 1, 2, 3 })))
+                                                                                                                    framework::dataset::make("Axis", { 0, -1, 1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -150,7 +150,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NELogSoftmaxLayerQuantizedFixture<uint8_t>, fra
                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                       combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
                                                                                                                               framework::dataset::make("Beta", { 1.0f, 2.0f }))),
-                                                                                                                      framework::dataset::make("Axis", { 1 })))
+                                                                                                                      framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
diff --git a/tests/validation/NEON/Logical.cpp b/tests/validation/NEON/Logical.cpp
new file mode 100644
index 0000000000..271d22f877
--- /dev/null
+++ b/tests/validation/NEON/Logical.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NELogical.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/LogicalFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(NEON)
+
+TEST_SUITE(LogicalAnd) // Validation cases for NELogicalAnd
+template <typename T> // T is forwarded to the validation fixture (instantiated with uint8_t below)
+using NELogicalAndFixture = LogicalAndValidationFixture<Tensor, Accessor, NELogicalAnd, T>; // Fixture bound to Tensor, Accessor and NELogicalAnd
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NELogicalAndFixture<uint8_t>, framework::DatasetMode::ALL, zip(datasets::SmallShapes(), datasets::SmallShapes()))
+{
+    // Dataset is SmallShapes() zipped with itself; compare target against reference (no tolerance argument is passed)
+    validate(Accessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NELogicalAndFixture<uint8_t>, framework::DatasetMode::ALL, datasets::SmallShapesBroadcast())
+{
+    // Dataset is SmallShapesBroadcast(); compare target against reference (no tolerance argument is passed)
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // LogicalAnd
+
+TEST_SUITE(LogicalOr) // Validation cases for NELogicalOr
+template <typename T> // T is forwarded to the validation fixture (instantiated with uint8_t below)
+using NELogicalOrFixture = LogicalOrValidationFixture<Tensor, Accessor, NELogicalOr, T>; // Fixture bound to Tensor, Accessor and NELogicalOr
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NELogicalOrFixture<uint8_t>, framework::DatasetMode::ALL, zip(datasets::SmallShapes(), datasets::SmallShapes()))
+{
+    // Dataset is SmallShapes() zipped with itself; compare target against reference (no tolerance argument is passed)
+    validate(Accessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NELogicalOrFixture<uint8_t>, framework::DatasetMode::ALL, datasets::SmallShapesBroadcast())
+{
+    // Dataset is SmallShapesBroadcast(); compare target against reference (no tolerance argument is passed)
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // LogicalOr
+
+TEST_SUITE(LogicalNot) // Validation cases for NELogicalNot
+
+template <typename T> // T is forwarded to the validation fixture (instantiated with uint8_t below)
+using NELogicalNotFixture = LogicalNotValidationFixture<Tensor, Accessor, NELogicalNot, T>; // Fixture bound to Tensor, Accessor and NELogicalNot
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NELogicalNotFixture<uint8_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                    DataType::U8)))
+{
+    // Dataset is SmallShapes() combined with DataType::U8; compare target against reference (no tolerance argument is passed)
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // LogicalNot
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/Magnitude.cpp b/tests/validation/NEON/Magnitude.cpp
deleted file mode 100644
index 61acc90d85..0000000000
--- a/tests/validation/NEON/Magnitude.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEMagnitude.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/MagnitudeFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-template <typename T>
-AbsoluteTolerance<T> tolerance(MagnitudeType magnitude_type)
-{
-    return AbsoluteTolerance<T>((MagnitudeType::L1NORM == magnitude_type) ? 0 : 1);
-}
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Magnitude)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::S16)), shape, data_type)
-{
-    // Create tensors
-    Tensor src1 = create_tensor<Tensor>(shape, data_type);
-    Tensor src2 = create_tensor<Tensor>(shape, data_type);
-    Tensor dst  = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function (default MagnitudeType::L2NORM)
-    NEMagnitude magnitude;
-    magnitude.configure(&src1, &src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src1.info()->padding(), padding);
-    validate(src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
-template <typename T>
-using NEMagnitudeFixture = MagnitudeValidationFixture<Tensor, Accessor, NEMagnitude, T>;
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEMagnitudeFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::Small2DShapes(), framework::dataset::make("Format", Format::S16)),
-                                                                                                         framework::dataset::make("MagnitudeType", { MagnitudeType::L1NORM, MagnitudeType::L2NORM })))
-
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance<int16_t>(_magnitude_type));
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEMagnitudeFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::Large2DShapes(), framework::dataset::make("Format", Format::S16)),
-                                                                                                       framework::dataset::make("MagnitudeType", { MagnitudeType::L1NORM, MagnitudeType::L2NORM })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance<int16_t>(_magnitude_type));
-}
-TEST_SUITE_END() // S16
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/MatMul.cpp b/tests/validation/NEON/MatMul.cpp
new file mode 100644
index 0000000000..f22bd9e86a
--- /dev/null
+++ b/tests/validation/NEON/MatMul.cpp
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
+
+#include "tests/datasets/LargeMatMulDataset.h"
+#include "tests/datasets/SmallMatMulDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/validation/fixtures/MatMulFixture.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using framework::dataset::make;
+
+TEST_SUITE(NEON)
+TEST_SUITE(MatMul)
+
+constexpr AbsoluteTolerance<float> tolerance_fp32(
+    0.001f); /**< Tolerance value for comparing reference's output against implementation's output for FP32 data types */
+const AbsoluteTolerance<half> tolerance_fp16(half(0.1f)); /**< Tolerance value passed to validate() by the FP16 test cases */
+#ifdef __aarch64__
+constexpr AbsoluteTolerance<int32_t> tolerance_qasymm8(1); /**< Tolerance value passed to validate() by the QASYMM8 test cases */
+constexpr AbsoluteTolerance<int32_t> tolerance_qasymm8_signed(1); /**< Tolerance value passed to validate() by the QASYMM8_SIGNED test cases */
+#endif // __aarch64__
+
+// clang-format off
+// *INDENT-OFF*
+// Validation Tests
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL,
+    zip( // The five datasets below are zipped: entry i of each list forms one test configuration
+        make("InputAInfo", {
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::F32),        // Mismatching data type (A is F32, B is QASYMM8)
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::S32),        // Unsupported data type (S32)
+            TensorInfo(TensorShape(9U, 6U, 2U), 1, DataType::F32),    // Broadcasting in batch dimension not supported
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::F32),        // Invalid shape for multiplication (B is 5x12)
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::F32), // Expected to pass: F32
+            TensorInfo(TensorShape(9U, 6U , 12U) , 1 , DataType::F32), // Expected to pass: F32 with a third dimension of 12
+            TensorInfo(TensorShape(9U, 6U , 12U) , 1 , DataType::F32), // Tensors are not dynamic (TensorIsConst is true)
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8), // Expected to pass: QASYMM8
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8_SIGNED), // Expected to pass: QASYMM8_SIGNED
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8_SIGNED), // Mismatching data type (output is QASYMM8)
+        }),
+        make("InputBInfo", {
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::S32),
+            TensorInfo(TensorShape(5U, 9U, 1U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 12U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U, 12U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U, 12U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8_SIGNED),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8_SIGNED),
+        }),
+        make("OutputInfo", {
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::S32),
+            TensorInfo(TensorShape(5U, 6U, 2U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U, 12U) , 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U, 12U) , 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8_SIGNED),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8),
+        }),
+        make("TensorIsConst", {false, false, false, false, false , false, true, false, false, false}), // Applied to both inputs in the body
+        make("Expected", { false, false, false, false, true, true, false, true, true, false })), // Expected value of bool(status) per entry
+    a_info, b_info, output_info, are_tensors_const, expected)
+{
+    TensorInfo a{a_info}; // Local copies so the constness flag can be set on them
+    TensorInfo b{b_info};
+    a.set_are_values_constant(are_tensors_const);
+    b.set_are_values_constant(are_tensors_const);
+    Status status =  NEMatMul::validate(&a, // Static validation only, with default MatMulInfo and CpuMatMulSettings
+                                        &b,
+                                        &output_info,
+                                        MatMulInfo(),
+                                        CpuMatMulSettings());
+    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); // Status converted to bool must equal the expected flag
+}
+// *INDENT-ON*
+// clang-format on
+
+// Generic template: used by the FP32 and FP16 RunSmall/RunLarge cases (and FP32 RunHighDimensions), which vary ActivationInfo
+template <typename T>
+using NEMatMulFixture = MatMulValidationWithActivationFixture<Tensor, Accessor, NEMatMul, CpuMatMulSettings, T>;
+
+// Fast math template: used by the BF16 RunSmall/RunLarge cases, which pass CpuMatMulSettings().fast_math(true)
+template <typename T>
+using NEMatMulFastMathFixture = MatMulGenericValidationFixture<Tensor, Accessor, NEMatMul, CpuMatMulSettings, T>;
+
+template <typename T> // Fixed format template: used by the BF16 RunTinyFixedFormat case
+using NEMatMulFixedFormatFixture = MatMulFixedFormatFixture<Tensor, Accessor, NEMatMul, CpuMatMulSettings, T>;
+
+template <typename T> // Dynamic tensors template: used by the RunStressDynamicTensors cases
+using NEMatMulDynamicTensorsFixture =
+    MatMulValidationWithDynamicTensorsFixture<Tensor, Accessor, NEMatMul, CpuMatMulSettings, T>;
+
+template <typename T> // Quantized template: used by the QASYMM8 and QASYMM8_SIGNED cases
+using NEQuantizedMatMulFixture = QuantizedMatMulValidationFixture<Tensor, Accessor, NEMatMul, CpuMatMulSettings, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32) // float cases, all validated with tolerance_fp32
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEMatMulFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}), // Both settings of the TransposeA parameter
+                               make("TransposeB", {false, true}), // Both settings of the TransposeB parameter
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo", // Default-constructed activation info and RELU
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})))
+{
+    // Compare target against reference using the FP32 tolerance
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEMatMulFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulDataset(), // Same parameters as RunSmall, on LargeMatMulDataset
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})))
+{
+    // Compare target against reference using the FP32 tolerance
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+FIXTURE_DATA_TEST_CASE(RunHighDimensions,
+                       NEMatMulFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::HighDimensionalMatMulDataset(), // Same parameters as RunSmall, on HighDimensionalMatMulDataset
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})))
+{
+    // Compare target against reference using the FP32 tolerance
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunStressDynamicTensors,
+                       NEMatMulDynamicTensorsFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfRuns", 5))) // Extra fixture parameter; presumably the number of repeated runs - confirm in MatMulFixture.h
+{
+    // Compare target against reference using the FP32 tolerance
+    validate(Accessor(_target), _reference, tolerance_fp32);
+}
+TEST_SUITE_END() // FP32
+
+#ifdef ARM_COMPUTE_ENABLE_BF16
+/* Note: MatMul BF16 is enabled by specifying FP32 datatype and enabling the fast math setting */
+constexpr AbsoluteTolerance<float> tolerance_bf16(0.02f); /**< Tolerance value passed to validate() by the BF16 test cases */
+TEST_SUITE(BF16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEMatMulFastMathFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32), // F32 data with fast math enabled, per the note above
+                               make("ActivationInfo", {ActivationLayerInfo()}), // Default-constructed activation info only
+                               make("RunTimes", {0}),
+                               make("Settings", {CpuMatMulSettings().fast_math(true)}),
+                               make("LhsQInfo", {QuantizationInfo()}), // Default-constructed quantization info for all three tensors
+                               make("RhsQInfo", {QuantizationInfo()}),
+                               make("OutQInfo", {QuantizationInfo()})))
+{
+    // Compare target against reference using the BF16 tolerance
+    validate(Accessor(_target), _reference, tolerance_bf16);
+}
+
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+FIXTURE_DATA_TEST_CASE(RunTinyFixedFormat,
+                       NEMatMulFixedFormatFixture<bfloat16>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::TinyMatMulDataset(),
+                               make("TransposeA", {false}), // Only the non-transposed setting is exercised here
+                               make("TransposeB", {false}),
+                               make("DataType", DataType::BFLOAT16),
+                               make("ActivationInfo", {ActivationLayerInfo()}),
+                               make("RunTimes", {0}),
+                               make("Settings", {CpuMatMulSettings().fast_math(true).fixed_format(true)}), // Fast math plus fixed format
+                               make("LhsQInfo", {QuantizationInfo()}),
+                               make("RhsQInfo", {QuantizationInfo()}),
+                               make("OutQInfo", {QuantizationInfo()})))
+{
+    if (CPUInfo::get().has_bf16()) // Nothing is validated when the CPU does not report BF16 support
+    {
+        // Compare target against reference using the BF16 tolerance
+        validate(Accessor(_target), _reference, tolerance_bf16);
+    }
+}
+#endif /* ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS */
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEMatMulFastMathFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulDataset(), // Same parameters as RunSmall, on LargeMatMulDataset
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F32),
+                               make("ActivationInfo", {ActivationLayerInfo()}),
+                               make("RunTimes", {0}),
+                               make("Settings", {CpuMatMulSettings().fast_math(true)}),
+                               make("LhsQInfo", {QuantizationInfo()}),
+                               make("RhsQInfo", {QuantizationInfo()}),
+                               make("OutQInfo", {QuantizationInfo()})))
+{
+    // Compare with the BF16 tolerance plus a tolerance_num of 0.01 (presumably the allowed mismatch ratio - confirm in Validation.h)
+    validate(Accessor(_target), _reference, tolerance_bf16, 0.01 /* tolerance_num */);
+}
+TEST_SUITE_END() // BF16
+#endif           /* ARM_COMPUTE_ENABLE_BF16 */
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16) // half cases, all validated with tolerance_fp16
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEMatMulFixture<half>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}), // Both settings of the TransposeA parameter
+                               make("TransposeB", {false, true}), // Both settings of the TransposeB parameter
+                               make("DataType", DataType::F16),
+                               make("ActivationInfo", // Default-constructed activation info and RELU
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})))
+{
+    // Compare target against reference using the FP16 tolerance
+    validate(Accessor(_target), _reference, tolerance_fp16);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEMatMulFixture<half>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulDataset(), // Same parameters as RunSmall, on LargeMatMulDataset
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F16),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})))
+{
+    // Compare target against reference using the FP16 tolerance
+    validate(Accessor(_target), _reference, tolerance_fp16);
+}
+FIXTURE_DATA_TEST_CASE(RunStressDynamicTensors,
+                       NEMatMulDynamicTensorsFixture<half>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::F16),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfRuns", 5))) // Extra fixture parameter; presumably the number of repeated runs - confirm in MatMulFixture.h
+{
+    // Compare target against reference using the FP16 tolerance
+    validate(Accessor(_target), _reference, tolerance_fp16);
+}
+TEST_SUITE_END() // FP16
+#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+TEST_SUITE_END() // Float
+
+#ifdef __aarch64__ // All the GeMM CPU assembly kernels for integer datatypes require aarch64
+TEST_SUITE(Quantized)
+
+TEST_SUITE(QASYMM8) // uint8_t cases, all validated with tolerance_qasymm8
+
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEQuantizedMatMulFixture<uint8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8),
+                               make("ActivationInfo", // Default-constructed activation info and RELU
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}), // Fixture parameter; presumably additional runs after the first - confirm in MatMulFixture.h
+make("LhsQInfo", {QuantizationInfo(1.f / 50, 1)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 30, -1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 2)})))
+{
+    // Compare target against reference using the QASYMM8 tolerance
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallExtraActivation,
+                       NEQuantizedMatMulFixture<uint8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::SmallerMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8),
+                               make("ActivationInfo", // The two bounded RELU variants, not covered by RunSmall
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 50, 1)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 30, -1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 2)})))
+{
+    // Compare target against reference using the QASYMM8 tolerance
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEQuantizedMatMulFixture<uint8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 100, 1)}), // First argument is smaller than in RunSmall (1/100 instead of 1/50)
+make("RhsQInfo", {QuantizationInfo(1.f / 200, -1)}), // First argument is smaller than in RunSmall (1/200 instead of 1/30)
+make("OutQInfo", {QuantizationInfo(1.f, 2)})))
+{
+    // Compare target against reference using the QASYMM8 tolerance
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED) // int8_t cases, all validated with tolerance_qasymm8_signed
+
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEQuantizedMatMulFixture<int8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8_SIGNED),
+                               make("ActivationInfo", // Default-constructed activation info and RELU
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}), // Fixture parameter; presumably additional runs after the first - confirm in MatMulFixture.h
+make("LhsQInfo", {QuantizationInfo(1.f / 40, -2)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 50, 1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 1)})))
+{
+    // Compare target against reference using the QASYMM8_SIGNED tolerance
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallExtraActivation,
+                       NEQuantizedMatMulFixture<int8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::SmallerMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8_SIGNED),
+                               make("ActivationInfo", // The two bounded RELU variants, not covered by RunSmall
+{
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 40, -2)}),
+make("RhsQInfo", {QuantizationInfo(1.f / 50, 1)}),
+make("OutQInfo", {QuantizationInfo(1.f, 1)})))
+{
+    // Compare target against reference using the QASYMM8_SIGNED tolerance
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       NEQuantizedMatMulFixture<int8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulDataset(),
+                               make("TransposeA", {false, true}),
+                               make("TransposeB", {false, true}),
+                               make("DataType", DataType::QASYMM8_SIGNED),
+                               make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+}),
+make("NumberOfExtraRuns", {0, 1}),
+make("LhsQInfo", {QuantizationInfo(1.f / 150, -2)}), // First argument is smaller than in RunSmall (1/150 instead of 1/40)
+make("RhsQInfo", {QuantizationInfo(1.f / 250, 1)}), // First argument is smaller than in RunSmall (1/250 instead of 1/50)
+make("OutQInfo", {QuantizationInfo(1.f, 1)})))
+{
+    // Compare target against reference using the QASYMM8_SIGNED tolerance
+    validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE_END() // Quantized
+#endif           // __aarch64__
+
+TEST_SUITE_END() // MatMul
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/MaxUnpoolingLayer.cpp b/tests/validation/NEON/MaxUnpoolingLayer.cpp
new file mode 100644
index 0000000000..0eb021fe71
--- /dev/null
+++ b/tests/validation/NEON/MaxUnpoolingLayer.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/MaxUnpoolingLayerFixture.h"
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(NEON)
+TEST_SUITE(PoolingLayer)
+
+template <typename T> // T is instantiated by the tests below with float (FP32) and half (FP16)
+using NEMaxUnpoolingLayerFixture = MaxUnpoolingLayerValidationFixture<Tensor, Accessor, NEPoolingLayer, NEMaxUnpoolingLayer, T>;
+// Pooling configurations shared by the floating-point tests: MAX pooling, Size2D(2, 2) pool size, two PadStrideInfo variants
+const auto PoolingLayerIndicesDatasetFPSmall = combine(combine(framework::dataset::make("PoolType", { PoolingType::MAX }), framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                                       framework::dataset::make("PadStride", { PadStrideInfo(2, 2, 0, 0), PadStrideInfo(2, 1, 0, 0) }));
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
+                                                                                                                   framework::dataset::make("DataType", DataType::F32))),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
+
+                                                                                                                  ))
+{
+    // Validate output: F32 target vs. reference; no tolerance argument is passed to validate()
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // FP32
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
+                                                                                                                  framework::dataset::make("DataType", DataType::F16))),
+                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
+
+                                                                                                                 ))
+{
+    // Validate output: F16 target vs. reference; no tolerance argument is passed to validate()
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // FP16
+#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+TEST_SUITE_END() // Float
+
+TEST_SUITE(KernelSelection)
+
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
+               combine(framework::dataset::make("CpuExt", std::string("NEON")),
+                       framework::dataset::make("DataType", { DataType::F32,
+                                                              DataType::F16,
+                                                              DataType::QASYMM8,
+                                                              DataType::QASYMM8_SIGNED
+                                                            })),
+               cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+    // Describe the CPU features for this case: flags are derived from the cpu_ext / data_type dataset values
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "NEON");
+    cpu_isa.sve  = (cpu_ext == "SVE"); // never true here: the "CpuExt" dataset above only supplies "NEON"
+    cpu_isa.fp16 = (data_type == DataType::F16);
+    // Query the kernel's preferred implementation for this data type / ISA combination
+    const auto *selected_impl = CpuMaxUnpoolingLayerKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+    // Expected implementation name: "<lower-cased cpu_ext>_<cpu_impl_dt(data_type)>_maxunpooling"
+    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_maxunpooling";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // KernelSelection
+TEST_SUITE_END() // PoolingLayer
+TEST_SUITE_END() // Neon
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/MeanStdDev.cpp b/tests/validation/NEON/MeanStdDev.cpp
deleted file mode 100644
index d8b3fd88cd..0000000000
--- a/tests/validation/NEON/MeanStdDev.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/MeanStdDevFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-RelativeTolerance<float> tolerance_rel_high_error(0.05f);
-RelativeTolerance<float> tolerance_rel_low_error(0.0005f);
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(MeanStdDev)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-
-    // Create output variables
-    float mean    = 0.f;
-    float std_dev = 0.f;
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create configure function
-    NEMeanStdDev mean_std_dev_image;
-    mean_std_dev_image.configure(&src, &mean, &std_dev);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src.info()->padding(), padding);
-}
-
-template <typename T>
-using NEMeanStdDevFixture = MeanStdDevValidationFixture<Tensor, Accessor, NEMeanStdDev, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEMeanStdDevFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                          DataType::U8)))
-{
-    // Validate mean output
-    validate(_target.first, _reference.first);
-
-    // Validate std_dev output
-    validate(_target.second, _reference.second, tolerance_rel_high_error);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEMeanStdDevFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                        DataType::U8)))
-{
-    // Validate mean output
-    validate(_target.first, _reference.first, tolerance_rel_low_error);
-
-    // Validate std_dev output
-    validate(_target.second, _reference.second, tolerance_rel_high_error);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp b/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
index 132700929c..085f3608a0 100644
--- a/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
+++ b/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,8 @@ namespace
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<half> tolerance_f16(half(0.2f));
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-RelativeTolerance<float> tolerance_f32(1e-8f);
+RelativeTolerance<float>   tolerance_f32(1e-4f); // tolerance passed to validate() by the FP32 tests
+RelativeTolerance<uint8_t> tolerance_qasymm8(1); // used by the QASYMM8 tests. NOTE(review): a relative tolerance of 1 on uint8_t looks very permissive - confirm AbsoluteTolerance was not intended
 } // namespace
 
 TEST_SUITE(NEON)
@@ -81,7 +82,7 @@ TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEMeanStdDevNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(),
                        framework::dataset::make("DataType", DataType::F16)),
                        framework::dataset::make("InPlace", { false, true })),
-                       framework::dataset::make("Epsilon", { 1e-8 })))
+                       framework::dataset::make("Epsilon", { 1e-3 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -101,7 +102,7 @@ TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEMeanStdDevNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(),
                        framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("InPlace", { false, true })),
-                       framework::dataset::make("Epsilon", { 1e-8 })))
+                       framework::dataset::make("Epsilon", { 1e-7 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -114,11 +115,25 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEMeanStdDevNormalizationLayerFixture<float>, f
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
+
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEMeanStdDevNormalizationLayerFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(),
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("InPlace", { false, true })),
+                       framework::dataset::make("Epsilon", { 1e-7 })))
+{
+    // Validate output: the QASYMM8 target result is compared with the reference using tolerance_qasymm8
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // MeanStdNormalizationLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Median3x3.cpp b/tests/validation/NEON/Median3x3.cpp
deleted file mode 100644
index 4b0f639b4d..0000000000
--- a/tests/validation/NEON/Median3x3.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEMedian3x3.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/Median3x3Fixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr unsigned int filter_size = 3;              /* Size of the kernel/filter in number of elements. */
-constexpr BorderSize   border_size(filter_size / 2); /* Border size of the kernel/filter around its central element. */
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Median3x3)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
-                                                                   datasets::BorderModes()),
-               shape, data_type, border_mode)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEMedian3x3 median3x3;
-    median3x3.configure(&src, &dst, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size);
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(1);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-1);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
-template <typename T>
-using NEMedian3x3Fixture = Median3x3ValidationFixture<Tensor, Accessor, NEMedian3x3, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEMedian3x3Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::U8)),
-                                                                                                         datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEMedian3x3Fixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                               DataType::U8)),
-                                                                                                       datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/MinMaxLocation.cpp b/tests/validation/NEON/MinMaxLocation.cpp
deleted file mode 100644
index bc08666c5f..0000000000
--- a/tests/validation/NEON/MinMaxLocation.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEMinMaxLocation.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/ArrayAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/MinMaxLocationFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(MinMaxLocation)
-
-template <typename T>
-using NEMinMaxLocationFixture = MinMaxLocationValidationFixture<Tensor, Accessor, Array<Coordinates2D>, ArrayAccessor<Coordinates2D>, NEMinMaxLocation, T>;
-
-void validate_configuration(const Tensor &src, TensorShape shape)
-{
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create output storage
-    int32_t            min{};
-    int32_t            max{};
-    Coordinates2DArray min_loc(shape.total_size());
-    Coordinates2DArray max_loc(shape.total_size());
-
-    // Create and configure function
-    NEMinMaxLocation min_max_loc;
-    min_max_loc.configure(&src, &min, &max, &min_loc, &max_loc);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 1).required_padding();
-    validate(src.info()->padding(), padding);
-}
-
-TEST_SUITE(U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    src.info()->set_format(Format::U8);
-
-    validate_configuration(src, shape);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEMinMaxLocationFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                              DataType::U8)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEMinMaxLocationFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::U8)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::S16)), shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    src.info()->set_format(Format::S16);
-
-    validate_configuration(src, shape);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEMinMaxLocationFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                              DataType::S16)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEMinMaxLocationFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::S16)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-TEST_SUITE_END() // S16
-
-TEST_SUITE(Float)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::F32)), shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    src.info()->set_format(Format::F32);
-
-    validate_configuration(src, shape);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEMinMaxLocationFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                                                                                                            DataType::F32)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEMinMaxLocationFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                                                                                                          DataType::F32)))
-{
-    validate_min_max_loc(_target, _reference);
-}
-
-TEST_SUITE_END() // F32
-
-TEST_SUITE_END() // MinMaxLocation
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/NonLinearFilter.cpp b/tests/validation/NEON/NonLinearFilter.cpp
deleted file mode 100644
index 08ad1e2407..0000000000
--- a/tests/validation/NEON/NonLinearFilter.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NENonLinearFilter.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/MatrixPatternDataset.h"
-#include "tests/datasets/NonLinearFilterFunctionDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/NonLinearFilterFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(NonLinearFilter)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), datasets::NonLinearFilterFunctions()),
-                                                                                   framework::dataset::make("MaskSize", { 3U, 5U })),
-                                                                           datasets::MatrixPatterns()),
-                                                                   datasets::BorderModes()),
-               shape, function, mask_size, pattern, border_mode)
-{
-    std::mt19937                           generator(library->seed());
-    std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
-    const uint8_t                          constant_border_value = distribution_u8(generator);
-
-    // Create the mask
-    std::vector<uint8_t> mask(mask_size * mask_size);
-    fill_mask_from_pattern(mask.data(), mask_size, mask_size, pattern);
-    const auto half_mask_size = static_cast<int>(mask_size / 2);
-
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, DataType::U8);
-    Tensor dst = create_tensor<Tensor>(shape, DataType::U8);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NENonLinearFilter filter;
-    filter.configure(&src, &dst, function, mask_size, pattern, mask.data(), border_mode, constant_border_value);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, BorderSize(half_mask_size));
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), ((MatrixPattern::OTHER == pattern) ? 1 : 8));
-    calculator.set_border_mode(border_mode);
-    calculator.set_border_size(half_mask_size);
-
-    const PaddingSize write_padding = calculator.required_padding(PaddingCalculator::Option::EXCLUDE_BORDER);
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-half_mask_size);
-
-    const PaddingSize read_padding = calculator.required_padding(PaddingCalculator::Option::INCLUDE_BORDER);
-
-    validate(src.info()->padding(), read_padding);
-    validate(dst.info()->padding(), write_padding);
-}
-
-template <typename T>
-using NENonLinearFilterFixture = NonLinearFilterValidationFixture<Tensor, Accessor, NENonLinearFilter, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NENonLinearFilterFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                       datasets::NonLinearFilterFunctions()),
-                                                                                                                       framework::dataset::make("MaskSize", { 3U, 5U })),
-                                                                                                                       datasets::MatrixPatterns()),
-                                                                                                                       datasets::BorderModes()),
-                                                                                                               framework::dataset::make("DataType", DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), _border_size));
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NENonLinearFilterFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                     datasets::NonLinearFilterFunctions()),
-                                                                                                                     framework::dataset::make("MaskSize", { 3U, 5U })),
-                                                                                                                     datasets::MatrixPatterns()),
-                                                                                                                     datasets::BorderModes()),
-                                                                                                             framework::dataset::make("DataType", DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, shape_to_valid_region(_reference.shape(), (_border_mode == BorderMode::UNDEFINED), _border_size));
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/NormalizationLayer.cpp b/tests/validation/NEON/NormalizationLayer.cpp
index 20dcafb719..7260d1044e 100644
--- a/tests/validation/NEON/NormalizationLayer.cpp
+++ b/tests/validation/NEON/NormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,8 +61,6 @@ const auto NormalizationDatasetFP32 = combine(combine(combine(datasets::Normaliz
 TEST_SUITE(NEON)
 TEST_SUITE(NormalizationLayer)
 
-//TODO(COMPMID-415): Missing configuration?
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
@@ -70,24 +68,21 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Even normalization
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Non implemented IN_MAP_2D
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Window shrink
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                           }),
     framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F16),
                                             TensorInfo(TensorShape(27U, 11U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                           })),
     framework::dataset::make("NormInfo",  { NormalizationLayerInfo(NormType::IN_MAP_1D, 5),
                                             NormalizationLayerInfo(NormType::IN_MAP_1D, 5),
                                             NormalizationLayerInfo(NormType::IN_MAP_1D, 4),
                                             NormalizationLayerInfo(NormType::IN_MAP_2D, 5),
-                                            NormalizationLayerInfo(NormType::IN_MAP_1D, 5),
                                             NormalizationLayerInfo(NormType::CROSS_MAP, 1),
                                            })),
-    framework::dataset::make("Expected", { false, false, false, false, false, true })),
+    framework::dataset::make("Expected", { false, false, false, true, true })),
     input_info, output_info, norm_info, expected)
 {
     bool is_valid = bool(NENormalizationLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), norm_info));
@@ -102,16 +97,9 @@ using NENormalizationLayerFixture = NormalizationValidationFixture<Tensor, Acces
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NENormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(NormalizationDataset,
-                                                                                                                       framework::dataset::make("DataType", DataType::F16)),
-                                                                                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_f16);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NENormalizationLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(NormalizationDataset,
-                                                                                                                     framework::dataset::make("DataType", DataType::F16)),
-                                                                                                             framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NENormalizationLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(NormalizationDataset,
+                                                                                                                 framework::dataset::make("DataType", DataType::F16)),
+                                                                                                         framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -138,7 +126,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // NormalizationLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/OpticalFlow.cpp b/tests/validation/NEON/OpticalFlow.cpp
deleted file mode 100644
index 1f4bf5fddf..0000000000
--- a/tests/validation/NEON/OpticalFlow.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/Array.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
-#include "arm_compute/runtime/NEON/functions/NEOpticalFlow.h"
-#include "arm_compute/runtime/Pyramid.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/ArrayAccessor.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/OpticalFlowDataset.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/OpticalFlowFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(OpticalFlow)
-
-// *INDENT-OFF*
-// clang-format off
-using NEOpticalFlowFixture = OpticalFlowValidationFixture<Tensor,
-                                                          Accessor,
-                                                          KeyPointArray,
-                                                          ArrayAccessor<KeyPoint>,
-                                                          NEOpticalFlow,
-                                                          Pyramid,
-                                                          NEGaussianPyramidHalf,
-                                                          uint8_t>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEOpticalFlowFixture, framework::DatasetMode::PRECOMMIT, combine(combine(
-                       datasets::SmallOpticalFlowDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       datasets::BorderModes()))
-{
-    // Validate output
-    ArrayAccessor<KeyPoint> array(_target);
-    validate_keypoints(array.buffer(),
-                       array.buffer() + array.num_values(),
-                       _reference.begin(),
-                       _reference.end());
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEOpticalFlowFixture, framework::DatasetMode::NIGHTLY, combine(combine(
-                       datasets::LargeOpticalFlowDataset(),
-                       framework::dataset::make("Format", Format::U8)),
-                       datasets::BorderModes()))
-{
-    // Validate output
-    ArrayAccessor<KeyPoint> array(_target);
-
-    validate_keypoints(array.buffer(),
-                       array.buffer() + array.num_values(),
-                       _reference.begin(),
-                       _reference.end());
-}
-// clang-format on
-// *INDENT-ON*
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/PReluLayer.cpp b/tests/validation/NEON/PReluLayer.cpp
index bc93c65778..69a05175ad 100644
--- a/tests/validation/NEON/PReluLayer.cpp
+++ b/tests/validation/NEON/PReluLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -206,7 +206,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // PReluLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/PadLayer.cpp b/tests/validation/NEON/PadLayer.cpp
index 5049347f27..4947af3423 100644
--- a/tests/validation/NEON/PadLayer.cpp
+++ b/tests/validation/NEON/PadLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -188,7 +188,7 @@ TEST_SUITE_END() // QASYMM8
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // PadLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Permute.cpp b/tests/validation/NEON/Permute.cpp
index 07578d3896..d897bbbe07 100644
--- a/tests/validation/NEON/Permute.cpp
+++ b/tests/validation/NEON/Permute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,6 +42,11 @@ namespace validation
 {
 namespace
 {
+const auto PermuteVectors2 = framework::dataset::make("PermutationVector",
+{
+    PermutationVector(0U, 1U),
+    PermutationVector(1U, 0U),
+});
 const auto PermuteVectors3 = framework::dataset::make("PermutationVector",
 {
     PermutationVector(2U, 0U, 1U),
@@ -61,7 +66,7 @@ const auto PermuteVectors4 = framework::dataset::make("PermutationVector",
     PermutationVector(3U, 0U, 2U, 1U),
     PermutationVector(0U, 3U, 2U, 1U)
 });
-const auto PermuteVectors         = concat(PermuteVectors3, PermuteVectors4);
+const auto PermuteVectors         = concat(concat(PermuteVectors2, PermuteVectors3), PermuteVectors4);
 const auto PermuteParametersSmall = concat(concat(datasets::Small2DShapes(), datasets::Small3DShapes()), datasets::Small4DShapes()) * PermuteVectors;
 const auto PermuteParametersLarge = datasets::Large4DShapes() * PermuteVectors;
 } // namespace
@@ -71,7 +76,7 @@ TEST_SUITE(Permute)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-                                                framework::dataset::make("InputInfo",{  
+                                                framework::dataset::make("InputInfo",{
                                                                                         TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     // permutation not supported
                                                                                         TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     // permutation not supported
                                                                                         TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     // permutation not supported
@@ -85,26 +90,26 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                                                                         TensorInfo(TensorShape(27U, 13U, 37U, 2U), 1, DataType::F32)  // permutation not supported
 
                                                                                     }),
-                                                framework::dataset::make("OutputInfo", { 
-                                                                                        TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16),     
-                                                                                        TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     
+                                                framework::dataset::make("OutputInfo", {
+                                                                                        TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16),
+                                                                                        TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),
                                                                                         TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),
                                                                                         TensorInfo(TensorShape(5U, 7U), 1, DataType::U8),
-                                                                                        TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16), 
-                                                                                        TensorInfo(TensorShape(13U, 37U, 27U, 2U), 1, DataType::F32),  
-                                                                                        TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16), 
-                                                                                        TensorInfo(TensorShape(3U, 5U, 7U, 7U), 1, DataType::S16), 
-                                                                                        TensorInfo(TensorShape(13U, 37U, 27U, 2U), 1, DataType::F32),  
+                                                                                        TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16),
+                                                                                        TensorInfo(TensorShape(13U, 37U, 27U, 2U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16),
+                                                                                        TensorInfo(TensorShape(3U, 5U, 7U, 7U), 1, DataType::S16),
+                                                                                        TensorInfo(TensorShape(13U, 37U, 27U, 2U), 1, DataType::F32),
                                                                                         TensorInfo(TensorShape(37U, 2U, 13U, 27U), 1, DataType::F32),
                                                                                         TensorInfo(TensorShape(37U, 2U, 13U, 27U), 1, DataType::F32)
 
                                                                                     })),
-                                                framework::dataset::make("PermutationVector", { 
+                                                framework::dataset::make("PermutationVector", {
                                                                                                 PermutationVector(2U, 1U, 0U),
                                                                                                 PermutationVector(2U, 2U, 1U),
                                                                                                 PermutationVector(1U, 1U, 1U),
                                                                                                 PermutationVector(2U, 0U, 1U),
-                                                                                                PermutationVector(2U, 0U, 1U), 
+                                                                                                PermutationVector(2U, 0U, 1U),
                                                                                                 PermutationVector(1U, 2U, 0U),
                                                                                                 PermutationVector(3U, 2U, 0U, 1U),
                                                                                                 PermutationVector(3U, 2U, 0U, 1U),
@@ -120,29 +125,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small4DShapes(), framework::dataset::make("DataType", { DataType::S8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32 })),
-               shape, data_type)
-{
-    // Define permutation vector
-    const PermutationVector perm(2U, 0U, 1U);
-
-    // Permute shapes
-    TensorShape output_shape = shape;
-    permute(output_shape, perm);
-
-    // Create tensors
-    Tensor ref_src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst     = create_tensor<Tensor>(output_shape, data_type);
-
-    // Create and Configure function
-    NEPermute perm_func;
-    perm_func.configure(&ref_src, &dst, perm);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(output_shape);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 template <typename T>
 using NEPermuteFixture = PermuteValidationFixture<Tensor, Accessor, NEPermute, T>;
 
diff --git a/tests/validation/NEON/Phase.cpp b/tests/validation/NEON/Phase.cpp
deleted file mode 100644
index 92ed9f716f..0000000000
--- a/tests/validation/NEON/Phase.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEPhase.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/PhaseFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr AbsoluteTolerance<uint8_t> tolerance_value(1);
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Phase)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::S16)), shape, data_type)
-{
-    // Create tensors
-    Tensor src1 = create_tensor<Tensor>(shape, data_type);
-    Tensor src2 = create_tensor<Tensor>(shape, data_type);
-    Tensor dst  = create_tensor<Tensor>(shape, DataType::U8);
-
-    ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEPhase phase;
-    phase.configure(&src1, &src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src1.info()->padding(), padding);
-    validate(src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
-template <typename T>
-using NEPhaseFixture = PhaseValidationFixture<Tensor, Accessor, NEPhase, T>;
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPhaseFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::Small2DShapes(), framework::dataset::make("Format", Format::S16)),
-                                                                                                     framework::dataset::make("PhaseType", { PhaseType::UNSIGNED, PhaseType::SIGNED })))
-{
-    // Validate output
-    validate_wrap(Accessor(_target), _reference, tolerance_value, 0);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPhaseFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::Large2DShapes(), framework::dataset::make("Format", Format::S16)),
-                                                                                                   framework::dataset::make("PhaseType", { PhaseType::UNSIGNED, PhaseType::SIGNED })))
-{
-    // Validate output
-    validate_wrap(Accessor(_target), _reference, tolerance_value, 0);
-}
-TEST_SUITE_END() // S16
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp
index 6a75b00b9b..964d1c5deb 100644
--- a/tests/validation/NEON/PixelWiseMultiplication.cpp
+++ b/tests/validation/NEON/PixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,6 +56,11 @@ const auto PixelWiseMultiplicationQASYMM8QuantDataset = combine(combine(
                                                                     framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0) })),
                                                                 framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 32768.f, 0) }));
 
+const auto PixelWiseMultiplicationQASYMM8QuantInPlaceDataset = combine(combine(
+                                                                           framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 32768.f, 10) }),
+                                                                           framework::dataset::make("Src1QInfo", { QuantizationInfo(5.f / 32768.f, 10) })),
+                                                                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 10) }));
+
 const auto PixelWiseMultiplicationPolicySTNUDataset = combine(
                                                           framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE }),
                                                           framework::dataset::make("RoundingPolicy", { RoundingPolicy::TO_NEAREST_UP }));
@@ -64,21 +69,37 @@ const auto PixelWiseMultiplicationPolicySTZDataset = combine(
                                                          framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE }),
                                                          framework::dataset::make("RoundingPolicy", { RoundingPolicy::TO_ZERO }));
 
+/** Tests for in-place computation
+ * Because the current interface stores the TensorInfo (including quantization
+ * information) in the kernel, it is difficult to give the output different tensor
+ * metadata (e.g., quantization information, data type, or a broadcast shape)
+ * when an input tensor is also used as the output of the computation.
+ * The following in-place dataset is therefore used only when passing
+ * the exact same Tensor object as both input and output makes sense
+ * (i.e., all the tensor metadata is the same). If the output is
+ * expected to have different quantization information, a different data type
+ * or a different shape, in-place computation is not tested.
+ */
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
+
 #define DEFAULT_VALIDATE validate(Accessor(_target), _reference);
 #define VALIDATE(TYPE, TOLERANCE) validate(Accessor(_target), _reference, AbsoluteTolerance<TYPE>(TOLERANCE), 0.f);
 #define WRAP_VALIDATE(TYPE, TOLERANCE) validate_wrap(Accessor(_target), _reference, AbsoluteTolerance<TYPE>(TOLERANCE), 0.f);
 
 // *INDENT-OFF*
 // clang-format off
-#define PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(TEST_NAME, FIXTURE, MODE, SHAPES, DT1, DT2, SCALE, RP, VALIDATE) \
-    FIXTURE_DATA_TEST_CASE(TEST_NAME, NEPixelWiseMultiplication##FIXTURE, framework::DatasetMode::MODE,                   \
-                           combine(combine(combine(combine(combine(                                                       \
+#define PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(TEST_NAME, FIXTURE, MODE, SHAPES, DT1, DT2, DT3, SCALE, RP, INPLACE_DATASET, VALIDATE) \
+    FIXTURE_DATA_TEST_CASE(TEST_NAME, NEPixelWiseMultiplication##FIXTURE, framework::DatasetMode::MODE,                        \
+                           combine(combine(combine(combine(combine(combine(combine(                                            \
                            datasets::SHAPES,                                                                              \
                            framework::dataset::make("DataType1", DataType::DT1)),                                         \
                            framework::dataset::make("DataType2", DataType::DT2)),                                         \
+                           framework::dataset::make("DataType3", DataType::DT3)),                                         \
                            framework::dataset::make("Scale", std::move(SCALE))),                                          \
                            datasets::ConvertPolicies()),                                                                  \
-                           framework::dataset::make("RoundingPolicy", RoundingPolicy::RP)))                               \
+                           framework::dataset::make("RoundingPolicy", RoundingPolicy::RP)),                               \
+                           (INPLACE_DATASET)))                                                                            \
     {                                                                                                                     \
         VALIDATE                                                                                                          \
     }
@@ -96,11 +117,16 @@ using NEPixelWiseMultiplicationToU8Fixture = PixelWiseMultiplicationValidationFi
 template <typename T>
 using NEPixelWiseMultiplicationToS16Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, int16_t>;
 template <typename T>
+using NEPixelWiseMultiplicationToS32Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, int32_t>;
+template <typename T>
 using NEPixelWiseMultiplicationToF16Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, half_float::half>;
 template <typename T>
-using NEPixelWiseMultiplicationToF32Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, float>;
+using NEPixelWiseMultiplicationToF32Fixture     = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, float>;
+using NEPixelWiseMultiplicationU8U8ToS16Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, uint8_t, uint8_t, int16_t>;
 template <typename T>
-using NEPixelWiseMultiplicationBroadcastFixture = PixelWiseMultiplicationBroadcastValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, float>;
+using NEPixelWiseMultiplicationBroadcastFixture              = PixelWiseMultiplicationBroadcastValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, T>;
+using NEPixelWiseMultiplicationBroadcastQASYMM8Fixture       = PixelWiseMultiplicationBroadcastValidationQuantizedFixture<Tensor, Accessor, NEPixelWiseMultiplication, uint8_t, uint8_t>;
+using NEPixelWiseMultiplicationBroadcastQASYMM8SignedFixture = PixelWiseMultiplicationBroadcastValidationQuantizedFixture<Tensor, Accessor, NEPixelWiseMultiplication, int8_t, int8_t>;
 
 TEST_SUITE(NEON)
 TEST_SUITE(PixelWiseMultiplication)
@@ -121,6 +147,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),            //11 Mismatching data type
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),            //12 Ok
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),     //13 Quantized cannot do WRAP
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),                //14 S32 does not support scale255
                                                       }),
                framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
@@ -135,6 +162,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
                                                      })),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
@@ -142,13 +170,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
                                                      })),
                framework::dataset::make("Scale",{  scale_unity,
                                                    scale_unity,
@@ -162,7 +191,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                                    scale_unity,
                                                    scale_unity,
                                                    scale_unity,
-                                                   scale_unity})),
+                                                   scale_unity,
+                                                   scale_255})),
                framework::dataset::make("OverflowPolicy",{
                                                    ConvertPolicy::WRAP,
                                                    ConvertPolicy::WRAP,
@@ -177,9 +207,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                                    ConvertPolicy::WRAP,
                                                    ConvertPolicy::SATURATE,
                                                    ConvertPolicy::WRAP,
+                                                   ConvertPolicy::SATURATE,
                                         })),
 
-               framework::dataset::make("Expected", { true, true, false, false, false, false, false, false, true , false, false, true, false })),
+               framework::dataset::make("Expected", { true, true, true, false, false, false, false, false, true , false, false, true, false, false})),
                input1_info, input2_info, output_info, scale, policy, expected)
 {
     bool has_error = bool(NEPixelWiseMultiplication::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), scale, policy, RoundingPolicy::TO_ZERO));
@@ -188,165 +219,198 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
+TEST_SUITE(InPlaceValidate)
+TEST_CASE(SingleTensor, framework::DatasetMode::ALL) // One TensorInfo is passed as input1, input2 and output at the same time.
+{
+    const auto random_shape       = TensorShape{ 9, 9 };
+    const auto single_tensor_info = TensorInfo{ random_shape, 1, DataType::F32 };
+
+    Status result = NEPixelWiseMultiplication::validate(&single_tensor_info, &single_tensor_info, &single_tensor_info, scale_unity, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);
+    ARM_COMPUTE_EXPECT(bool(result) == true, framework::LogLevel::ERRORS); // using the same info for all three arguments must validate
+}
+
+TEST_CASE(ValidBroadCast, framework::DatasetMode::ALL) // Output info is the larger input's info; validate() is expected to succeed.
+{
+    const auto larger_shape  = TensorShape{ 27U, 13U, 2U };
+    const auto smaller_shape = TensorShape{ 1U, 13U, 2U }; // same as larger_shape except that the first dimension is 1
+
+    const auto larger_tensor_info  = TensorInfo{ larger_shape, 1, DataType::F32 };
+    const auto smaller_tensor_info = TensorInfo{ smaller_shape, 1, DataType::F32 };
+
+    Status result = NEPixelWiseMultiplication::validate(&larger_tensor_info, &smaller_tensor_info, &larger_tensor_info, scale_unity, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);
+    ARM_COMPUTE_EXPECT(bool(result) == true, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(InvalidBroadcastOutput, framework::DatasetMode::ALL) // Output info is the smaller input's info; validate() is expected to fail.
+{
+    const auto larger_shape  = TensorShape{ 27U, 13U, 2U };
+    const auto smaller_shape = TensorShape{ 1U, 13U, 2U }; // same as larger_shape except that the first dimension is 1
+
+    const auto larger_tensor_info  = TensorInfo{ larger_shape, 1, DataType::F32 };
+    const auto smaller_tensor_info = TensorInfo{ smaller_shape, 1, DataType::F32 };
+
+    Status result = NEPixelWiseMultiplication::validate(&larger_tensor_info, &smaller_tensor_info, &smaller_tensor_info, scale_unity, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); // NOTE(review): presumably rejected because the output is smaller than the broadcast result - confirm
+}
+
+TEST_CASE(InvalidBroadcastBoth, framework::DatasetMode::ALL) // Neither input's info is accepted as the output info for this pair of shapes.
+{
+    const auto shape0 = TensorShape{ 9U, 9U };
+    const auto shape1 = TensorShape{ 9U, 1U, 2U }; // NOTE(review): presumably each shape would need broadcasting to match the other - confirm
+
+    const auto info0 = TensorInfo{ shape0, 1, DataType::F32 };
+    const auto info1 = TensorInfo{ shape1, 1, DataType::F32 };
+
+    Status result{}; // reused for both validate() calls below
+
+    result = NEPixelWiseMultiplication::validate(&info0, &info1, &info0, scale_unity, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); // case 1: output info is the first input's info
+
+    result = NEPixelWiseMultiplication::validate(&info0, &info1, &info1, scale_unity, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); // case 2: output info is the second input's info
+}
+TEST_SUITE_END() // InPlaceValidate
+
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8_SIGNED)
-TEST_SUITE(Scale255)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8SignedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+TEST_SUITE(ScaleUnity)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8SignedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                     framework::dataset::make("DataTypeIn1", DataType::QASYMM8_SIGNED)),
+                                                                                                                     framework::dataset::make("DataTypeIn2", DataType::QASYMM8_SIGNED)),
+                                                                                                                     framework::dataset::make("DataTypeOut", DataType::QASYMM8_SIGNED)),
+                                                                                                                     framework::dataset::make("Scale", { scale_unity })),
+                                                                                                                     PixelWiseMultiplicationPolicySTZDataset),
+                                                                                                                     PixelWiseMultiplicationQASYMM8QuantDataset),
+                                                                                                                     OutOfPlaceDataSet)) // NOTE(review): OutOfPlaceDataSet is declared outside this chunk; presumably requests a separate output tensor - confirm
+{
+    // Validate output: _target (read through Accessor) against _reference within tolerance_qasymm8
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallInPlace, NEPixelWiseMultiplicationQASYMM8SignedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        framework::dataset::make("DataTypeIn1", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("DataTypeIn2", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("DataTypeOut", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("Scale", { scale_unity })),
                        PixelWiseMultiplicationPolicySTZDataset),
-                       PixelWiseMultiplicationQASYMM8QuantDataset))
+                       PixelWiseMultiplicationQASYMM8QuantInPlaceDataset), // in-place variant uses its own quantization dataset (declared outside this chunk)
+                       InPlaceDataSet)) // NOTE(review): InPlaceDataSet is declared outside this chunk; presumably makes the output reuse an input tensor - confirm
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-TEST_SUITE_END() // Scale255
-TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // ScaleUnity
+TEST_SUITE_END() // QASYMM8_SIGNED
 
 TEST_SUITE(QASYMM8)
 TEST_SUITE(Scale255)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("Scale", { scale_255 })),
-                                                                                                                     PixelWiseMultiplicationPolicySTNUDataset),
-                                                                                                                     PixelWiseMultiplicationQASYMM8QuantDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                   framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("Scale", { scale_255 })),
-                                                                                                                   PixelWiseMultiplicationPolicySTNUDataset),
-                                                                                                                   PixelWiseMultiplicationQASYMM8QuantDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("Scale", { scale_255 })),
+                                                                                                                       PixelWiseMultiplicationPolicySTNUDataset),
+                                                                                                                       PixelWiseMultiplicationQASYMM8QuantDataset),
+                                                                                                               OutOfPlaceDataSet)) // Scale255: scale_255 is paired with the STNU policy dataset; OutOfPlaceDataSet is declared outside this chunk
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END() // Scale255
 TEST_SUITE(ScaleUnity)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("Scale", { scale_unity })),
-                                                                                                                     PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                     PixelWiseMultiplicationQASYMM8QuantDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("Scale", { scale_unity })),
+                                                                                                                       PixelWiseMultiplicationPolicySTZDataset),
+                                                                                                                       PixelWiseMultiplicationQASYMM8QuantDataset),
+                                                                                                               OutOfPlaceDataSet)) // ScaleUnity: scale_unity is paired with the STZ policy dataset; OutOfPlaceDataSet is declared outside this chunk
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                   framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("Scale", { scale_unity })),
-                                                                                                                   PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                   PixelWiseMultiplicationQASYMM8QuantDataset))
+TEST_SUITE_END() // ScaleUnity
+TEST_SUITE(ScaleOther)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("Scale", { scale_other })),
+                                                                                                                       PixelWiseMultiplicationPolicySTZDataset),
+                                                                                                                       PixelWiseMultiplicationQASYMM8QuantDataset),
+                                                                                                               OutOfPlaceDataSet)) // ScaleOther: scale_other is paired with the STZ policy dataset; OutOfPlaceDataSet is declared outside this chunk
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-TEST_SUITE_END() // ScaleUnity
-TEST_SUITE(ScaleOther)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("Scale", { scale_other })),
-                                                                                                                     PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                     PixelWiseMultiplicationQASYMM8QuantDataset))
+TEST_SUITE_END() // ScaleOther
+TEST_SUITE(Broadcast)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationBroadcastQASYMM8Fixture, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapesBroadcast(), // shapes come from the broadcast shape dataset, not SmallShapes()
+                                                                               framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
+                                                                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
+                                                               framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
+                                                       framework::dataset::make("Scale", { scale_other })),
+                                               PixelWiseMultiplicationPolicySTZDataset),
+                                       PixelWiseMultiplicationQASYMM8QuantDataset),
+                               OutOfPlaceDataSet)) // NOTE(review): OutOfPlaceDataSet is declared outside this chunk; presumably requests a separate output tensor - confirm
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                   framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("Scale", { scale_other })),
-                                                                                                                   PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                   PixelWiseMultiplicationQASYMM8QuantDataset))
+FIXTURE_DATA_TEST_CASE(RunTinyInPlace, NEPixelWiseMultiplicationBroadcastQASYMM8Fixture, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::TinyShapesBroadcastInplace(), // dedicated shape dataset for the in-place broadcast run (declared outside this chunk)
+                                                                               framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
+                                                                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
+                                                               framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
+                                                       framework::dataset::make("Scale", { scale_other })),
+                                               PixelWiseMultiplicationPolicySTZDataset),
+                                       PixelWiseMultiplicationQASYMM8QuantInPlaceDataset),
+                               InPlaceDataSet)) // NOTE(review): InPlaceDataSet is declared outside this chunk; presumably makes the output reuse an input tensor - confirm
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-TEST_SUITE_END() // ScaleOther
+TEST_SUITE_END() // Broadcast
 TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QSYMM16)
 TEST_SUITE(Scale255)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("Scale", { scale_255 })),
-                                                                                                                     PixelWiseMultiplicationPolicySTNUDataset),
-                                                                                                                     PixelWiseMultiplicationQSYMM16QuantDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_qsymm16);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                   framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("Scale", { scale_255 })),
-                                                                                                                   PixelWiseMultiplicationPolicySTNUDataset),
-                                                                                                                   PixelWiseMultiplicationQSYMM16QuantDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("Scale", { scale_255 })),
+                                                                                                                       PixelWiseMultiplicationPolicySTNUDataset),
+                                                                                                                       PixelWiseMultiplicationQSYMM16QuantDataset),
+                                                                                                               OutOfPlaceDataSet)) // Scale255: scale_255 is paired with the STNU policy dataset; OutOfPlaceDataSet is declared outside this chunk
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qsymm16);
 }
 TEST_SUITE_END() // Scale255
 TEST_SUITE(ScaleUnity)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("Scale", { scale_unity })),
-                                                                                                                     PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                     PixelWiseMultiplicationQSYMM16QuantDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_qsymm16);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                   framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("Scale", { scale_unity })),
-                                                                                                                   PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                   PixelWiseMultiplicationQSYMM16QuantDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("Scale", { scale_unity })),
+                                                                                                                       PixelWiseMultiplicationPolicySTZDataset),
+                                                                                                                       PixelWiseMultiplicationQSYMM16QuantDataset),
+                                                                                                               OutOfPlaceDataSet)) // ScaleUnity: scale_unity is paired with the STZ policy dataset; OutOfPlaceDataSet is declared outside this chunk
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qsymm16);
 }
 TEST_SUITE_END() // ScaleUnity
 TEST_SUITE(ScaleOther)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("Scale", { scale_other })),
-                                                                                                                     PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                     PixelWiseMultiplicationQSYMM16QuantDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_qsymm16);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                   framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("Scale", { scale_other })),
-                                                                                                                   PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                   PixelWiseMultiplicationQSYMM16QuantDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("Scale", { scale_other })),
+                                                                                                                       PixelWiseMultiplicationPolicySTZDataset),
+                                                                                                                       PixelWiseMultiplicationQSYMM16QuantDataset),
+                                                                                                               OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qsymm16);
@@ -354,13 +418,14 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQSYMM16Fixture, framew
 TEST_SUITE_END() // ScaleOther
 TEST_SUITE_END() // QSYMM16
 TEST_SUITE(QSYMM16toS32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16ToS32Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                       framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                       framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                       framework::dataset::make("DataTypeOut", DataType::S32)),
-                       framework::dataset::make("Scale", { scale_unity })),
-                       PixelWiseMultiplicationPolicySTZDataset),
-                       PixelWiseMultiplicationQSYMM16QuantDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16ToS32Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                    framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
+                                                                                                                    framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
+                                                                                                                    framework::dataset::make("DataTypeOut", DataType::S32)),
+                                                                                                                    framework::dataset::make("Scale", { scale_unity })),
+                                                                                                                    PixelWiseMultiplicationPolicySTZDataset),
+                                                                                                                    PixelWiseMultiplicationQSYMM16QuantDataset),
+                                                                                                                    OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -368,21 +433,48 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16ToS32Fixture, f
 TEST_SUITE_END() // QSYMM16toS32
 TEST_SUITE_END() // Quantized
 
+TEST_SUITE(U8U8toS16)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationU8U8ToS16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), // U8 x U8 -> S16 product at scale_255 with TO_NEAREST_UP rounding, for every convert policy
+                                                                                                                       framework::dataset::make("DataTypeIn1", DataType::U8)),
+                                                                                                                       framework::dataset::make("DataTypeIn2", DataType::U8)),
+                                                                                                                       framework::dataset::make("DataTypeOut", DataType::S16)),
+                                                                                                                       framework::dataset::make("Scale", { scale_255 })),
+                                                                                                                       datasets::ConvertPolicies()),
+                                                                                                                       framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_UP)),
+                                                                                                                       OutOfPlaceDataSet))
+{
+    // Validate output with an absolute tolerance of 1 on the int16_t results; the trailing 0.f is presumably the allowed ratio of mismatching elements - TODO confirm against validate_wrap
+    validate_wrap(Accessor(_target), _reference, AbsoluteTolerance<int16_t>(1), 0.f);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall1, NEPixelWiseMultiplicationU8U8ToS16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), // U8 x U8 -> S16 product at scale_other with TO_ZERO rounding, for every convert policy
+                                                                                                                        framework::dataset::make("DataTypeIn1", DataType::U8)),
+                                                                                                                        framework::dataset::make("DataTypeIn2", DataType::U8)),
+                                                                                                                        framework::dataset::make("DataTypeOut", DataType::S16)),
+                                                                                                                        framework::dataset::make("Scale", { scale_other })),
+                                                                                                                        datasets::ConvertPolicies()),
+                                                                                                                        framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_ZERO)),
+                                                                                                                        framework::dataset::make("InPlace", { false }))) // NOTE(review): the RunSmall case in this suite passes OutOfPlaceDataSet here - presumably the same dataset; confirm and unify
+{
+    // Validate output: no tolerance is passed, so the comparison uses whatever default validate() applies
+    validate(Accessor(_target), _reference);
+}
+
+TEST_SUITE_END() // U8U8toS16
+
 TEST_SUITE(U8toU8)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, PRECOMMIT, SmallShapes(), U8, U8, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(uint8_t, 1))
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToU8Fixture<uint8_t>, NIGHTLY, LargeShapes(), U8, U8, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(uint8_t, 1))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, ALL, SmallShapes(), U8, U8, U8, scale_255, TO_NEAREST_UP, InPlaceDataSet, WRAP_VALIDATE(uint8_t, 1))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE(ScaleUnity)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, PRECOMMIT, SmallShapes(), U8, U8, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToU8Fixture<uint8_t>, NIGHTLY, LargeShapes(), U8, U8, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, ALL, SmallShapes(), U8, U8, U8, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleUnity
 
 TEST_SUITE(ScaleOther)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, PRECOMMIT, SmallShapes(), U8, U8, scale_other, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToU8Fixture<uint8_t>, NIGHTLY, LargeShapes(), U8, U8, scale_other, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, ALL, SmallShapes(), U8, U8, U8, scale_other, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleOther
 
 TEST_SUITE_END() // U8toU8
@@ -390,18 +482,18 @@ TEST_SUITE_END() // U8toU8
 TEST_SUITE(U8toS16)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, PRECOMMIT, SmallShapes(), U8, S16, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(int16_t, 2))
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToS16Fixture<uint8_t>, NIGHTLY, LargeShapes(), U8, S16, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(int16_t, 2))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_255, TO_NEAREST_UP, OutOfPlaceDataSet,
+                                                 WRAP_VALIDATE(int16_t, 2))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE(ScaleUnity)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, PRECOMMIT, SmallShapes(), U8, S16, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToS16Fixture<uint8_t>, NIGHTLY, LargeShapes(), U8, S16, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_unity, TO_ZERO, OutOfPlaceDataSet,
+                                                 DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleUnity
 
 TEST_SUITE(ScaleOther)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, PRECOMMIT, SmallShapes(), U8, S16, scale_other, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToS16Fixture<uint8_t>, NIGHTLY, LargeShapes(), U8, S16, scale_other, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_other, TO_ZERO, OutOfPlaceDataSet,
+                                                 DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleOther
 
 TEST_SUITE_END() // U8toS16
@@ -409,27 +501,41 @@ TEST_SUITE_END() // U8toS16
 TEST_SUITE(S16toS16)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, PRECOMMIT, SmallShapes(), S16, S16, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(int16_t, 2))
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToS16Fixture<int16_t>, NIGHTLY, LargeShapes(), S16, S16, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(int16_t, 2))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, ALL, SmallShapes(), S16, S16, S16, scale_255, TO_NEAREST_UP, InPlaceDataSet, WRAP_VALIDATE(int16_t, 2))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE(ScaleUnity)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, PRECOMMIT, SmallShapes(), S16, S16, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToS16Fixture<int16_t>, NIGHTLY, LargeShapes(), S16, S16, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, ALL, SmallShapes(), S16, S16, S16, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleUnity
 
 TEST_SUITE(ScaleOther)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, PRECOMMIT, SmallShapes(), S16, S16, scale_other, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToS16Fixture<int16_t>, NIGHTLY, LargeShapes(), S16, S16, scale_other, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, ALL, SmallShapes(), S16, S16, S16, scale_other, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleOther
 
 TEST_SUITE_END() // S16toS16
 
+TEST_SUITE(S32toS32) // Cases whose three data-type macro arguments are all S32 (presumably input 1, input 2 and output - confirm against the macro definition)
+
+TEST_SUITE(ScaleUnity)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS32Fixture<int32_t>, ALL, SmallShapes(), S32, S32, S32, scale_unity, TO_ZERO, InPlaceDataSet, WRAP_VALIDATE(int32_t, 1))
+TEST_SUITE_END() // ScaleUnity
+
+TEST_SUITE(ScaleOther)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS32Fixture<int32_t>, ALL, SmallShapes(), S32, S32, S32, scale_other, TO_ZERO, InPlaceDataSet, WRAP_VALIDATE(int32_t, 1))
+TEST_SUITE_END() // ScaleOther
+
+TEST_SUITE(Broadcast)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, BroadcastFixture<int32_t>, ALL, SmallShapesBroadcast(), S32, S32, S32, scale_unity, TO_ZERO, framework::dataset::make("InPlace", { false }), // NOTE(review): spelled out here while other cases in this file pass OutOfPlaceDataSet - presumably equivalent; confirm
+                                                 WRAP_VALIDATE(int32_t, 1))
+TEST_SUITE_END() // Broadcast
+
+TEST_SUITE_END() // S32toS32
+
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16toF16)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF16Fixture<half_float::half>, PRECOMMIT, SmallShapes(), F16, F16, scale_255, TO_NEAREST_UP, VALIDATE(float, 1.f))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF16Fixture<half_float::half>, ALL, SmallShapes(), F16, F16, F16, scale_255, TO_NEAREST_UP, InPlaceDataSet, VALIDATE(float, 1.f))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE_END() // F16toF16
@@ -438,24 +544,22 @@ TEST_SUITE_END() // F16toF16
 TEST_SUITE(F32toF32)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, PRECOMMIT, SmallShapes(), F32, F32, scale_255, TO_NEAREST_UP, VALIDATE(float, 1.f))
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToF32Fixture<float>, NIGHTLY, LargeShapes(), F32, F32, scale_255, TO_NEAREST_UP, VALIDATE(float, 1.f))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, ALL, SmallShapes(), F32, F32, F32, scale_255, TO_NEAREST_UP, InPlaceDataSet, VALIDATE(float, 1.f))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE(ScaleUnity)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, PRECOMMIT, SmallShapes(), F32, F32, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToF32Fixture<float>, NIGHTLY, LargeShapes(), F32, F32, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, ALL, SmallShapes(), F32, F32, F32, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleUnity
 
 TEST_SUITE(ScaleOther)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, PRECOMMIT, SmallShapes(), F32, F32, scale_other, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToF32Fixture<float>, NIGHTLY, LargeShapes(), F32, F32, scale_other, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, ALL, SmallShapes(), F32, F32, F32, scale_other, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleOther
 
 TEST_SUITE_END() // F32toF32
 
 TEST_SUITE(Broadcast)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, BroadcastFixture<float>, PRECOMMIT, SmallShapesBroadcast(), F32, F32, scale_255, TO_NEAREST_UP, VALIDATE(float, 1.f))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, BroadcastFixture<float>, ALL, SmallShapesBroadcast(), F32, F32, F32, scale_255, TO_NEAREST_UP, framework::dataset::make("InPlace", { false }),
+                                                 VALIDATE(float, 1.f))
 TEST_SUITE_END() // Broadcast
 
 TEST_SUITE_END()
diff --git a/tests/validation/NEON/Pooling3dLayer.cpp b/tests/validation/NEON/Pooling3dLayer.cpp
new file mode 100644
index 0000000000..07054462f5
--- /dev/null
+++ b/tests/validation/NEON/Pooling3dLayer.cpp
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/Pooling3dLayerDataset.h"
+#include "tests/datasets/PoolingTypesDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/Pooling3dLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+/** Floating-point pooling configurations: every datasets::PoolingTypes() entry with a Size3D(2, 3, 2) pooling size, four strides, two paddings, with and without padding exclusion */
+const auto Pooling3dLayerDatasetFP = combine(combine(combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { Size3D(2, 3, 2) })),
+                                                             framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 1, 1), Size3D(1, 2, 1), Size3D(2, 2, 1) })),
+                                                     framework::dataset::make("Padding", { Padding3D(0, 1, 0), Padding3D(1, 1, 1) })),
+                                             framework::dataset::make("ExcludePadding", { true, false }));
+
+const auto Pooling3dLayerDatasetFPSmall = combine(combine(combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { Size3D(2, 2, 2), Size3D(3, 3, 3) })),
+                                                                  framework::dataset::make("Stride", { Size3D(2, 2, 2), Size3D(2, 1, 1) })),
+                                                          framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 1), Padding3D(1, 0, 0) })),
+                                                  framework::dataset::make("ExcludePadding", { true, false })); // Floating-point set with two cubic pooling sizes, two strides and three paddings, with and without padding exclusion
+
+const auto Pooling3dLayerDatasetQASYMM8Small = combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                                               framework::dataset::make("PoolingSize", { Size3D(3, 3, 3) })),
+                                                                       framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 1, 1), Size3D(1, 2, 1), Size3D(2, 2, 1) })),
+                                                               framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 1), Padding3D(1, 0, 0) })),
+                                                       framework::dataset::make("ExcludePadding", { true })); // MAX and AVG only, 3x3x3 pooling size, four strides, three paddings, padding always excluded
+
+const auto Pooling3dLayerDatasetQASYMM8Large = combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                                               framework::dataset::make("PoolingSize", { Size3D(3, 3, 3) })),
+                                                                       framework::dataset::make("Stride", { Size3D(1, 1, 1), Size3D(2, 2, 1) })),
+                                                               framework::dataset::make("Padding", { Padding3D(0, 0, 0), Padding3D(1, 1, 0) })),
+                                                       framework::dataset::make("ExcludePadding", { true })); // MAX and AVG only, 3x3x3 pooling size, two strides, two paddings, padding always excluded
+
+using ShapeDataset = framework::dataset::ContainerDataset<std::vector<TensorShape>>; // Shorthand for a container dataset holding a std::vector of TensorShape
+
+constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Absolute tolerance (0.001) for comparing the reference output against the implementation's output for 32-bit floating-point type */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+constexpr AbsoluteTolerance<float> tolerance_f16(0.01f);     /**< Absolute tolerance (0.01) for comparing the reference output against the implementation's output for 16-bit floating-point type */
+#endif                                                       /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);   /**< Absolute tolerance (1) for comparing the reference output against the implementation's output for unsigned 8-bit asymmetric type */
+constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_s(1); /**< Absolute tolerance (1) for comparing the reference output against the implementation's output for signed 8-bit asymmetric type */
+
+const auto qasymm8_in_qinfo_dataset  = framework::dataset::make("InputQuantInfo", { QuantizationInfo(.2f, 10) }); // Single input quantization; arguments are presumably (scale, offset) - TODO confirm
+const auto qasymm8_out_qinfo_dataset = framework::dataset::make("OutputQuantInfo",
+{
+    QuantizationInfo(.2f, 10), // Same qinfo as the input dataset above
+    QuantizationInfo(.1f, 5),  // Multiplier <= 1
+    QuantizationInfo(2.f, 3)   // Multiplier > 1
+});
+
+const auto qasymm8_signed_in_qinfo_dataset  = framework::dataset::make("InputQuantInfo", { QuantizationInfo(.2f, -10) }); // Single input quantization; arguments are presumably (scale, offset) - TODO confirm
+const auto qasymm8_signed_out_qinfo_dataset = framework::dataset::make("OutputQuantInfo",
+{
+    QuantizationInfo(.2f, -10), // Same qinfo as the signed input dataset above
+    QuantizationInfo(.1f, -5),  // Multiplier <= 1
+    QuantizationInfo(2.f, -3)   // Multiplier > 1
+});
+
+} //namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(Pooling3dLayer)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( // Table-driven check of NEPooling3dLayer::validate(); zip presumably pairs the four lists row by row - TODO confirm
+    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 27U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),     // Mismatching data type
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 4U, 2U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid pad/size combination
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 4U, 2U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid pad/size combination
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid output shape
+                                            TensorInfo(TensorShape(5U, 13U, 15U, 2U, 3U), 1, DataType::F32, DataLayout::NDHWC),     // Global Pooling
+                                            TensorInfo(TensorShape(13U,13U, 5U, 1U, 2U),  1, DataType::F32, DataLayout::NDHWC),     // Invalid output Global Pooling
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 4U), 1, DataType::F32, DataLayout::NDHWC),     // Invalid data type
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 4U), 1, DataType::F32, DataLayout::NHWC),      // Invalid data layout
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 5U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(1U, 16U,  1U, 3U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 2U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 13U, 13U, 4U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                          }),
+    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(2U, 25U, 11U, 3U, 3U), 1, DataType::F16, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(2U, 30U, 11U, 3U, 2U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(2U, 25U, 16U, 3U, 2U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(2U, 27U, 13U, 3U, 3U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U,  1U,  1U, 1U, 3U), 1, DataType::F32, DataLayout::NDHWC),            // Global pooling applied
+                                            TensorInfo(TensorShape(5U,  2U,  2U, 2U, 2U), 1, DataType::F32, DataLayout::NDHWC),            // Invalid output Global Pooling
+                                            TensorInfo(TensorShape(5U, 12U, 12U, 3U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 12U, 12U, 3U, 4U), 1, DataType::QASYMM8, DataLayout::NDHWC),        // Invalid data type
+                                            TensorInfo(TensorShape(5U, 12U, 12U, 3U, 4U), 1, DataType::F32, DataLayout::NDHWC),            // Invalid data layout
+                                            TensorInfo(TensorShape(5U,  1U,  1U, 1U, 4U), 1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(1U, 15U, 1U, 2U, 4U), 1, DataType::F32, DataLayout::NDHWC),             // size larger than height
+                                            TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U),  1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 6U, 6U, 2U, 2U),  1, DataType::F32, DataLayout::NDHWC),
+                                            TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U),  1, DataType::F32, DataLayout::NDHWC),
+                                    })),
+    framework::dataset::make("PoolInfo",  { Pooling3dLayerInfo(PoolingType::AVG, 3, Size3D(1, 1, 1), Padding3D(0, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1, 1, 1), Padding3D(2, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1, 1, 1), Padding3D(0, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::L2,  3, Size3D(1, 1, 1), Padding3D(0, 0, 0)),
+                                            Pooling3dLayerInfo(PoolingType::AVG),
+                                            Pooling3dLayerInfo(PoolingType::MAX),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1U, 1U, 1U), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(1U, 1U, 1U), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG),
+                                            Pooling3dLayerInfo(PoolingType::MAX, 2, Size3D(1, 1, 2), Padding3D(0, 0, 0), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 2, Size3D(2U, 2U, 2U), Padding3D(), false),
+                                            Pooling3dLayerInfo(PoolingType::AVG, 1, Size3D(2U, 2U, 2U), Padding3D(2, 2, 2), true),  // padding (2) exceeds the pool size (1), assuming the 2nd argument is the pool size - the earlier wording said "equal"; confirm
+                                            Pooling3dLayerInfo(PoolingType::AVG, 1, Size3D(2U, 2U, 2U), Padding3D(2, 2, 2), false), // padding (2) exceeds the pool size (1), assuming the 2nd argument is the pool size - the earlier wording said "equal"; confirm
+                                            Pooling3dLayerInfo(PoolingType::AVG, 3, Size3D(2U, 2U, 2U), Padding3D(2,1,2,2,1,2), false, false, DimensionRoundingType::CEIL), // CEIL with asymmetric Padding
+                                            })),
+    framework::dataset::make("Expected", { false, false, false, false, true, false, false, false, false, true , false, true, false, false, false})), // NOTE(review): 15 PoolInfo/Expected entries but only 14 InputInfo/OutputInfo entries - confirm how zip treats the extra row (the CEIL case may never run)
+    input_info, output_info, pool_info, expected)
+{
+    bool is_valid = bool(NEPooling3dLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pool_info)); // validate() runs on non-resizable clones of the table entries
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using NEPoolingLayer3dFixture = Pooling3dLayerValidationFixture<Tensor, Accessor, NEPooling3dLayer, T>; // Pooling3dLayerValidationFixture bound to Tensor, Accessor and NEPooling3dLayer for element type T
+
+template <typename T>
+using NESpecial3dPoolingLayerFixture = SpecialPooling3dLayerValidationFixture<Tensor, Accessor, NEPooling3dLayer, T>; // SpecialPooling3dLayerValidationFixture bound to Tensor, Accessor and NEPooling3dLayer for element type T
+
+template <typename T>
+using NEPooling3dLayerGlobalFixture = Pooling3dLayerGlobalValidationFixture<Tensor, Accessor, NEPooling3dLayer, T>; // Pooling3dLayerGlobalValidationFixture bound to Tensor, Accessor and NEPooling3dLayer for element type T
+
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+
+FIXTURE_DATA_TEST_CASE(RunSpecial, NESpecial3dPoolingLayerFixture<float>, framework::DatasetMode::ALL, datasets::Pooling3dLayerDatasetSpecial() * framework::dataset::make("DataType", DataType::F32))
+{
+    // Compare the target output with the reference within tolerance_f32 for the Pooling3dLayerDatasetSpecial cases
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small5dShapes(), combine(Pooling3dLayerDatasetFPSmall,
+                                                                                                            framework::dataset::make("DataType", DataType::F32))))
+{
+    // Compare the F32 target output with the reference within tolerance_f32 for Small5dShapes combined with Pooling3dLayerDatasetFPSmall
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(datasets::Large5dShapes(), combine(Pooling3dLayerDatasetFPSmall, framework::dataset::make("DataType", DataType::F32)))) // NOTE(review): reuses the FPSmall configurations while the FP16 RunLarge case uses Pooling3dLayerDatasetFP - confirm this is intended
+{
+    // Compare the F32 target output with the reference within tolerance_f32 for Large5dShapes
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE(GlobalPooling)
+// *INDENT-OFF*
+// clang-format off
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(3U, 27U, 13U, 4U),
+                                                                             TensorShape(4U, 27U, 13U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(27, 13, 4) })), // Same 27/13/4 extents as dimensions 1-3 of both input shapes
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", {false, true})),
+                                    framework::dataset::make("DataType", DataType::F32)))
+{
+    // Compare the F32 target output with the reference within tolerance_f32
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunGlobalSmall, NEPooling3dLayerGlobalFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(27U, 13U, 4U, 3U), // NOTE(review): 27/13/4 sit in dimensions 0-2 here but in dimensions 1-3 in the sibling RunSmall case - confirm the intended layout
+                                                                             TensorShape(27U, 13U, 4U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("DataType", DataType::F32)))
+{
+    // Compare the F32 target output with the reference within tolerance_f32; only shape, pooling type and data type are supplied to this fixture
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(4U, 79U, 37U, 11U),
+                                                                             TensorShape(4U, 79U, 37U, 11U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(79, 37, 11) })), // Same 79/37/11 extents as dimensions 1-3 of both input shapes
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", {false, true})),
+                                    framework::dataset::make("DataType", DataType::F32)))
+{
+    // Compare the F32 target output with the reference within tolerance_f32
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP32
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small5x5Shapes(), combine(Pooling3dLayerDatasetFPSmall,
+                                                                                                           framework::dataset::make("DataType", DataType::F16))))
+{
+    // Compare target with reference within tolerance_f16. NOTE(review): uses Small5x5Shapes(); the quantized RunSmall cases use Small5dShapes() - confirm intended
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::Large5dShapes(), combine(Pooling3dLayerDatasetFP,
+                                                                                                           framework::dataset::make("DataType",
+                                                                                                                   DataType::F16))))
+{
+    // Nightly FP16 case over Large5dShapes(): compare the fixture's target output with its reference within tolerance_f16
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+TEST_SUITE(GlobalPooling)
+// *INDENT-OFF*
+// clang-format off
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayer3dFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(3U, 27U, 13U, 4U),
+                                                                             TensorShape(4U, 27U, 13U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(27, 13, 4) })), // Equals extents 1-3 of both input shapes
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", {false, true})),
+                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // FP16 case: compare the fixture's target output with its reference within tolerance_f16
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+
+FIXTURE_DATA_TEST_CASE(RunSmallGlobal, NEPooling3dLayerGlobalFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine( // Only shape, pooling type and data type are supplied to this fixture
+                                    framework::dataset::make("InputShape", { TensorShape(27U, 13U, 4U, 3U),
+                                                                             TensorShape(27U, 13U, 4U, 4U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // Compare target with reference within tolerance_f16. NOTE(review): the FP32 case with the same datasets is named RunGlobalSmall - naming differs
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayer3dFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                                    framework::dataset::make("InputShape", { TensorShape(4U, 79U, 37U, 11U),
+                                                                             TensorShape(4U, 79U, 37U, 11U, 2U)
+                                                                           }),
+                                    framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                    framework::dataset::make("PoolingSize", { Size3D(79, 37, 11) })), // Equals extents 1-3 of both input shapes
+                                    framework::dataset::make("Strides",  Size3D(1, 1, 1))),
+                                    framework::dataset::make("Paddings", Padding3D(0, 0, 0))),
+                                    framework::dataset::make("ExcludePadding", false)), // NOTE(review): the FP32 RunLarge case uses {false, true} - confirm 'true' is skipped on purpose
+                                    framework::dataset::make("DataType", DataType::F16)))
+{
+    // Nightly FP16 case: compare the fixture's target output with its reference within tolerance_f16
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP16
+#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+TEST_SUITE_END() // Float
+TEST_SUITE(Quantized)
+
+template <typename T> // T is the element type; the cases below instantiate it with uint8_t and int8_t
+using NEPooling3dLayerQuantizedFixture = Pooling3dLayerValidationQuantizedFixture<Tensor, Accessor, NEPooling3dLayer, T>; // Quantized fixture bound to Tensor / Accessor / NEPooling3dLayer
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPooling3dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small5dShapes(),
+                                                                                                                       combine(Pooling3dLayerDatasetQASYMM8Small,
+                                                                                                                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                                       qasymm8_in_qinfo_dataset),
+                                                                                                                       qasymm8_out_qinfo_dataset))
+{
+    // Precommit QASYMM8 case: compare the fixture's target output with its reference within tolerance_qasymm8
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPooling3dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large5dShapes(),
+                                                                                                                       combine(Pooling3dLayerDatasetQASYMM8Large,
+                                                                                                                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                                       qasymm8_in_qinfo_dataset),
+                                                                                                                       qasymm8_out_qinfo_dataset))
+{
+    // Nightly QASYMM8 case over Large5dShapes(): compare the fixture's target output with its reference within tolerance_qasymm8
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPooling3dLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small5dShapes(),
+                                                                                                                      combine(Pooling3dLayerDatasetQASYMM8Small, // Same pooling dataset as the unsigned RunSmall case
+                                                                                                                              framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
+                                                                                                                      qasymm8_signed_in_qinfo_dataset),
+                                                                                                                      qasymm8_signed_out_qinfo_dataset))
+{
+    // Precommit QASYMM8_SIGNED case: compare the fixture's target output with its reference within tolerance_qasymm8_s
+    validate(Accessor(_target), _reference, tolerance_qasymm8_s);
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // Quantized
+TEST_SUITE_END() // Pooling3dLayer
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp
index 4b073d5352..161fe627cc 100644
--- a/tests/validation/NEON/PoolingLayer.cpp
+++ b/tests/validation/NEON/PoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
 #include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
 #include "tests/datasets/PoolingLayerDataset.h"
@@ -54,7 +53,7 @@ const auto PoolingLayerDatasetFPSmall = combine(combine(combine(datasets::Poolin
 
 /** Input data sets for asymmetric data type */
 
-const auto PoolingLayerDatasetQASYMM8Small = combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3), Size2D(3, 7) })),
+const auto PoolingLayerDatasetQASYMM8Small = combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3), Size2D(3, 7), Size2D(7, 7) })), // MAX/AVG over four window sizes
                                                              framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 2, 1, 1) })),
                                                      framework::dataset::make("ExcludePadding", { true }));
 
@@ -66,6 +65,29 @@ constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);   /**< Tolerance valu
 constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_s(1); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric type */
 const auto                           pool_data_layout_dataset = framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC });
 
+const auto qasymm8_in_qinfo_dataset  = framework::dataset::make("InputQuantInfo", { QuantizationInfo(.2f, 10) }); // Single input quantization entry
+const auto qasymm8_out_qinfo_dataset = framework::dataset::make("OutputQuantInfo", // Three output quantization entries
+{
+    QuantizationInfo(.2f, 10), // Same qinfo as the single input entry above
+    QuantizationInfo(.1f, 5),  // Multiplier <= 1
+    QuantizationInfo(2.f, 3)   // Multiplier > 1
+});
+
+const auto qasymm8_signed_in_qinfo_dataset  = framework::dataset::make("InputQuantInfo", { QuantizationInfo(.2f, -10) }); // Single input quantization entry
+const auto qasymm8_signed_out_qinfo_dataset = framework::dataset::make("OutputQuantInfo", // Three output quantization entries
+{
+    QuantizationInfo(.2f, -10), // Same qinfo as the single input entry above
+    QuantizationInfo(.1f, -5),  // Multiplier <= 1
+    QuantizationInfo(2.f, -3)   // Multiplier > 1
+});
+
+// Cases where a pooling region lies completely outside the input tensor (excluding global pooling); each list has four entries, presumably paired positionally by zip
+const auto pool_outside_input_dataset = zip(zip(zip(zip(
+                                                        framework::dataset::make("Shape", { TensorShape{ 2U, 2U, 1U }, TensorShape{ 2U, 2U, 4U }, TensorShape{ 3U, 5U, 2U }, TensorShape{ 10U, 20U, 3U } }),
+                                                        framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                                                    framework::dataset::make("PoolingSize", { Size2D{ 2, 2 }, Size2D{ 3, 3 }, Size2D{ 2, 2 }, Size2D{ 3, 6 } })),
+                                                framework::dataset::make("PadStride", { PadStrideInfo{ 1, 1, 2, 2 }, PadStrideInfo{ 1, 1, 4, 4 }, PadStrideInfo{ 1, 1, 3, 3 }, PadStrideInfo{ 1, 1, 2, 5 } })),
+                                            framework::dataset::make("ExcludePadding", { false, false, false, false })); // Padding is never excluded in these cases
 } // namespace
 
 TEST_SUITE(NEON)
@@ -82,7 +104,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                             TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32),     // Invalid output Global Pooling
                                             TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::QASYMM8), // Invalid exclude_padding = false with quantized type, no actual padding and NHWC
                                             TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32),
-                                          }),
+                                            TensorInfo(TensorShape(1U, 16U, 1U),  1, DataType::F32),
+                                            TensorInfo(TensorShape(112, 112, 64,1), 1, DataType::F32, DataLayout::NHWC), // Mismatching number of channels
+                                            TensorInfo(TensorShape(112, 112, 64,1), 1, DataType::F32, DataLayout::NHWC), // Mismatching width
+                                         }),
     framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F16),
                                             TensorInfo(TensorShape(25U, 10U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(30U, 11U, 2U), 1, DataType::F32),
@@ -91,7 +116,11 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                             TensorInfo(TensorShape(2U, 2U, 5U), 1, DataType::F32),
                                             TensorInfo(TensorShape(12U, 12U, 5U), 1, DataType::QASYMM8),
                                             TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
-                                          })),
+                                            TensorInfo(TensorShape(1U, 15U, 1U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(56, 56, 64,1), 1, DataType::F32, DataLayout::NHWC),
+                                            TensorInfo(TensorShape(56, 51, 64,1), 1, DataType::F32, DataLayout::NHWC),
+
+                                           })),
     framework::dataset::make("PoolInfo",  { PoolingLayerInfo(PoolingType::AVG, 3, DataLayout::NCHW, PadStrideInfo(1, 1, 0, 0)),
                                             PoolingLayerInfo(PoolingType::AVG, 3, DataLayout::NCHW, PadStrideInfo(1, 1, 0, 0)),
                                             PoolingLayerInfo(PoolingType::AVG, 2, DataLayout::NCHW, PadStrideInfo(1, 1, 2, 0)),
@@ -100,8 +129,12 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                             PoolingLayerInfo(PoolingType::MAX, DataLayout::NCHW),
                                             PoolingLayerInfo(PoolingType::AVG, 2, DataLayout::NHWC, PadStrideInfo(), false),
                                             PoolingLayerInfo(PoolingType::AVG, DataLayout::NCHW),
+                                            PoolingLayerInfo(PoolingType::MAX, 2, DataLayout::NHWC, PadStrideInfo(1, 1, 0, 0), false),
+                                            PoolingLayerInfo(PoolingType::MAX,3,DataLayout::NHWC,PadStrideInfo(2,2,1,1)),
+                                            PoolingLayerInfo(PoolingType::MAX,3,DataLayout::NHWC,PadStrideInfo(2,2,1,1)),
+
                                            })),
-    framework::dataset::make("Expected", { false, false, false, false, true, false, false, false, true })),
+    framework::dataset::make("Expected", { false, false, false, false, true, false, true, false, false, false, false})),
     input_info, output_info, pool_info, expected)
 {
     bool is_valid = bool(NEPoolingLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pool_info));
@@ -115,6 +148,8 @@ using NEPoolingLayerIndicesFixture = PoolingLayerIndicesValidationFixture<Tensor
 
 template <typename T>
 using NEPoolingLayerFixture = PoolingLayerValidationFixture<Tensor, Accessor, NEPoolingLayer, T>;
+template <typename T> // Same fixture template as NEPoolingLayerFixture, with one extra trailing template argument
+using NEPoolingLayerMixedDataLayoutFixture = PoolingLayerValidationFixture<Tensor, Accessor, NEPoolingLayer, T, true>; // 'true' presumably enables the mixed-layout path - confirm against the fixture
 
 template <typename T>
 using NESpecialPoolingLayerFixture = SpecialPoolingLayerValidationFixture<Tensor, Accessor, NEPoolingLayer, T>;
@@ -122,27 +157,37 @@ using NESpecialPoolingLayerFixture = SpecialPoolingLayerValidationFixture<Tensor
 const auto PoolingLayerIndicesDatasetFPSmall = combine(combine(combine(framework::dataset::make("PoolType", { PoolingType::MAX }), framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
                                                                framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0) })),
                                                        framework::dataset::make("ExcludePadding", { true, false }));
-
+const auto PoolingLayerKernelIndicesDatasetFPSmall = combine(combine(combine(framework::dataset::make("PoolType", { PoolingType::MAX }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3), Size2D(7, 7) })), // MAX pooling only, three window sizes
+                                                                     framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 1, 1, 1) })),
+                                                             framework::dataset::make("ExcludePadding", { false })); // Consumed by the RunKernelIndices case
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunIndices, NEPoolingLayerIndicesFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
-                                                                                                                   framework::dataset::make("DataType",
-                                                                                                                           DataType::F32))),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
-
-                                                                                                                  ))
+FIXTURE_DATA_TEST_CASE(RunIndices, NEPoolingLayerIndicesFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                   combine(PoolingLayerIndicesDatasetFPSmall,
+                                                                                                                           framework::dataset::make("DataType", DataType::F32))),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                   framework::dataset::make("UseKernelIndices", { false }))) // Kernel indices disabled; RunKernelIndices covers 'true'
+{
+    // Check the pooled values within tolerance_f32, then the target indices against the reference indices
+    validate(Accessor(_target), _reference, tolerance_f32);
+    validate(Accessor(_target_indices), _ref_indices);
+}
+FIXTURE_DATA_TEST_CASE(RunKernelIndices, NEPoolingLayerIndicesFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                   combine(PoolingLayerKernelIndicesDatasetFPSmall,
+                                                                                                                           framework::dataset::make("DataType", DataType::F32))),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NHWC })), // NHWC is the only layout exercised here
+                                                                                                                   framework::dataset::make("UseKernelIndices", { true }))) // Kernel indices enabled for every configuration
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
     validate(Accessor(_target_indices), _ref_indices);
 }
-
 FIXTURE_DATA_TEST_CASE(RunSpecial, NESpecialPoolingLayerFixture<float>, framework::DatasetMode::ALL, datasets::PoolingLayerDatasetSpecial() * framework::dataset::make("DataType", DataType::F32))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPSmall,
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), combine(PoolingLayerDatasetFPSmall,
                                                                                                                   framework::dataset::make("DataType",
                                                                                                                           DataType::F32))),
                                                                                                           pool_data_layout_dataset))
@@ -150,6 +195,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<float>, framework::Datase
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEPoolingLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(),
+                       combine(combine(combine(combine(datasets::PoolingTypes(),
+                                                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })), // Single window size
+                                               framework::dataset::make("PadStride", { PadStrideInfo(2, 1, 0, 0) })), // Single pad/stride configuration
+                                       framework::dataset::make("ExcludePadding", { false })),
+                               framework::dataset::make("DataType", DataType::F32))),
+                       pool_data_layout_dataset))
+{
+    // Mixed-data-layout fixture, FP32: compare the fixture's target output with its reference within tolerance_f32
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
 FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
                                                                                                                 framework::dataset::make("DataType",
                                                                                                                         DataType::F32))),
@@ -158,11 +214,32 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<float>, framework::Datase
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
+TEST_SUITE(CornerCases)
+FIXTURE_DATA_TEST_CASE(PoolRegionCompletelyOutsideInput, NEPoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(pool_outside_input_dataset,
+                       framework::dataset::make("DataType",
+                                                DataType::F32)),
+                       pool_data_layout_dataset)) // Both NCHW and NHWC are exercised
+{
+    // FP32 run over pool_outside_input_dataset: compare the fixture's target output with its reference within tolerance_f32
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // CornerCases
 TEST_SUITE_END() // FP32
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPSmall,
+FIXTURE_DATA_TEST_CASE(RunIndices, NEPoolingLayerIndicesFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                  combine(PoolingLayerIndicesDatasetFPSmall,
+                                                                                                                          framework::dataset::make("DataType",
+                                                                                                                                  DataType::F16))),
+                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                  framework::dataset::make("UseKernelIndices", { false }))) // Kernel indices disabled for every FP16 configuration
+{
+    // Check the pooled values within tolerance_f16, then the target indices against the reference indices
+    validate(Accessor(_target), _reference, tolerance_f16);
+    validate(Accessor(_target_indices), _ref_indices);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), combine(PoolingLayerDatasetFPSmall,
                                                                                                                  framework::dataset::make("DataType", DataType::F16))),
                                                                                                          pool_data_layout_dataset))
 {
@@ -176,6 +253,16 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<half>, framework::Dataset
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
+TEST_SUITE(CornerCases)
+FIXTURE_DATA_TEST_CASE(PoolRegionCompletelyOutsideInput, NEPoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(pool_outside_input_dataset,
+                       framework::dataset::make("DataType",
+                                                DataType::F16)),
+                       pool_data_layout_dataset)) // Both NCHW and NHWC are exercised
+{
+    // FP16 run over pool_outside_input_dataset: compare the fixture's target output with its reference within tolerance_f16
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // CornerCases
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 TEST_SUITE_END() // Float
@@ -184,20 +271,64 @@ TEST_SUITE(Quantized)
 
 template <typename T>
 using NEPoolingLayerQuantizedFixture = PoolingLayerValidationQuantizedFixture<Tensor, Accessor, NEPoolingLayer, T>;
+template <typename T> // Same fixture template as NEPoolingLayerQuantizedFixture, with one extra trailing template argument
+using NEPoolingLayerQuantizedMixedDataLayoutFixture = PoolingLayerValidationQuantizedFixture<Tensor, Accessor, NEPoolingLayer, T, true>; // 'true' presumably enables the mixed-layout path - confirm against the fixture
 
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQASYMM8Small,
-                                                                                                                     framework::dataset::make("DataType", DataType::QASYMM8))),
-                                                                                                                     pool_data_layout_dataset))
+FIXTURE_DATA_TEST_CASE(RunSmallNCHW, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                       combine(PoolingLayerDatasetQASYMM8Small,
+                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW })), // NCHW only; the NHWC sweep is the RunSmall case
+                       qasymm8_in_qinfo_dataset),
+                       qasymm8_in_qinfo_dataset)) // NOTE(review): input dataset passed for the output too, so input and output qinfo are identical - confirm intended for NCHW
+{
+    // Compare the fixture's target output with its reference within tolerance_qasymm8
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                     combine(PoolingLayerDatasetQASYMM8Small,
+                                                                                                                             framework::dataset::make("DataType", DataType::QASYMM8))),
+                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NHWC })), // NHWC only; NCHW is the RunSmallNCHW case
+                                                                                                                     qasymm8_in_qinfo_dataset),
+                                                                                                                     qasymm8_out_qinfo_dataset)) // Output qinfo varies across three entries
+{
+    // Compare the fixture's target output with its reference within tolerance_qasymm8
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEPoolingLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                       combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })), // Single window size
+                                               framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })), // Single pad/stride configuration
+                                       framework::dataset::make("ExcludePadding", { true })),
+                               framework::dataset::make("DataType", DataType::QASYMM8))),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })),
+                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 255.f, 10) })),
+                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 255.f, 5) }))) // Differs from the input qinfo only in its second argument
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQASYMM8Small,
-                                                                                                                    framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
-                                                                                                                    pool_data_layout_dataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                                                                                                                    combine(PoolingLayerDatasetQASYMM8Small,
+                                                                                                                            framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
+                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                    qasymm8_signed_in_qinfo_dataset),
+                                                                                                                    qasymm8_signed_in_qinfo_dataset)) // NOTE(review): input dataset passed for the output too; qasymm8_signed_out_qinfo_dataset is not used here - confirm intended
+{
+    // Compare the fixture's target output with its reference within tolerance_qasymm8_s
+    validate(Accessor(_target), _reference, tolerance_qasymm8_s);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEPoolingLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallNoneUnitShapes(),
+                       combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })), // Single window size
+                                               framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })), // Single pad/stride configuration
+                                       framework::dataset::make("ExcludePadding", { true })),
+                               framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })),
+                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, -10) })),
+                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, -10) }))) // Identical to the input qinfo
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_s);
@@ -205,7 +336,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<int8_t>, framewo
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // PoolingLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/PriorBoxLayer.cpp b/tests/validation/NEON/PriorBoxLayer.cpp
index 5659b9f3ae..9a9b9a8e3d 100644
--- a/tests/validation/NEON/PriorBoxLayer.cpp
+++ b/tests/validation/NEON/PriorBoxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -90,7 +90,7 @@ TEST_SUITE_END() // Float
 TEST_SUITE_END() // FP32
 
 TEST_SUITE_END() // PriorBoxLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/QLSTMLayerNormalization.cpp b/tests/validation/NEON/QLSTMLayerNormalization.cpp
index 3d71175a6f..9738213114 100644
--- a/tests/validation/NEON/QLSTMLayerNormalization.cpp
+++ b/tests/validation/NEON/QLSTMLayerNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,10 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/NEON/Helper.h"
 #include "tests/PaddingCalculator.h"
@@ -167,7 +167,7 @@ TEST_SUITE(Quantized)
 TEST_SUITE(QSYMM16)
 
 /** Tests will be targetting
- * - Comparison between NEON kernel and the exact same but scalar version of reference kernel
+ * - Comparison between optimized kernel and the exact same but scalar version of reference kernel
  * - Input shapes of 1D and 2D with the first dimension covers boundary values of 128-bit vector size (0~3 iterations)
  * - Weight and bias 1D shape that have same size as that of input shapes
  * - Quantization scale is greater and smaller than one.
@@ -179,7 +179,7 @@ TEST_SUITE(QSYMM16)
  * - The algorithm has been sensitive to quantization scale but it is hard to fully test
  *   the sensitivity due to aforementioned reason.
  * - Again, it is hard to fully test corner values due to the exact same algorithm of the
- *   reference kernel and the NEON kernel.
+ *   reference kernel and the optimized kernel.
  */
 
 constexpr uint32_t qsymm16_per_vector = vector_size_byte / sizeof(int16_t);
@@ -216,7 +216,7 @@ FIXTURE_DATA_TEST_CASE(RandomValue2D, NEQLSTMLayerNormalizationFixture<int16_t>,
 TEST_SUITE_END() // QSYMM16
 TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // QLSTMLayerNormalization
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/NEON/QuantizationLayer.cpp b/tests/validation/NEON/QuantizationLayer.cpp
index a5372b897c..bab7490762 100644
--- a/tests/validation/NEON/QuantizationLayer.cpp
+++ b/tests/validation/NEON/QuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,7 @@
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/QuantizationLayerFixture.h"
 
+
 namespace arm_compute
 {
 namespace test
@@ -74,29 +75,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(QuantizationSmallShapes, framework::dataset::make("DataType", DataType::F32)), shape, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, DataType::QASYMM8);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEQuantizationLayer quant_layer;
-    quant_layer.configure(&src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    validate(src.info()->padding(), PaddingSize());
-    validate(dst.info()->padding(), PaddingSize());
-}
-
 template <typename T>
 using NEQuantizationLayerQASYMM8Fixture = QuantizationValidationFixture<Tensor, Accessor, NEQuantizationLayer, T, uint8_t>;
 template <typename T>
@@ -120,7 +98,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8Signed, NEQuantizationLayerQASYMM8SignedFi
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_u8);
+    validate(Accessor(_target), _reference, tolerance_s8);
 }
 FIXTURE_DATA_TEST_CASE(RunSmallQASYMM16, NEQuantizationLayerQASYMM16Fixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(QuantizationSmallShapes,
                        framework::dataset::make("DataType", DataType::F32)),
@@ -163,7 +141,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8Signed, NEQuantizationLayerQASYMM8SignedFi
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_u8);
+    validate(Accessor(_target), _reference, tolerance_s8);
 }
 FIXTURE_DATA_TEST_CASE(RunSmallQASYMM16, NEQuantizationLayerQASYMM16Fixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(QuantizationSmallShapes,
                        framework::dataset::make("DataType", DataType::F16)),
@@ -205,7 +183,16 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8, NEQuantizationLayerQASYMM8GenFixture<uin
                        framework::dataset::make("DataType", DataType::QASYMM8)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8 })),
                        framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(0.5f, 10) })),
-                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(2.0f, 15) })))
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(2.0f, 15), QuantizationInfo(0.5f, 25) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_u8);
+}
+FIXTURE_DATA_TEST_CASE(ConvertUint8toInt8, NEQuantizationLayerQASYMM8GenFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(QuantizationSmallShapes,
+                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                       framework::dataset::make("DataTypeOut", { DataType::QASYMM8_SIGNED })),
+                       framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(2.0f, -1) })),
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(2.0f, 127) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
@@ -214,7 +201,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8_SIGNED, NEQuantizationLayerQASYMM8_SIGNED
                        framework::dataset::make("DataTypeIn", DataType::QASYMM8)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8_SIGNED })),
                        framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.0f, 10), QuantizationInfo(2.0f, -25) })),
-                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.0f, 15) })))
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.0f, 15), QuantizationInfo(1.0f, 127) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_s8);
@@ -234,7 +221,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8_SIGNED, NEQuantizationLayerQASYMM8_SIGNED
                        framework::dataset::make("DataTypeIn", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8_SIGNED })),
                        framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.0f, 10) })),
-                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(2.0f, -5) })))
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(2.0f, -5), QuantizationInfo(1.0f, 43) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_s8);
@@ -243,16 +230,26 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8, NEQuantizationLayerQASYMM8GenFixture<int
                        framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8 })),
                        framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(2.0f, 10), QuantizationInfo(2.0f, -25) })),
-                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.0f, 30) })))
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.0f, 30), QuantizationInfo(2.0f, -128) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
 }
+FIXTURE_DATA_TEST_CASE(ConvertInt8toUint8, NEQuantizationLayerQASYMM8_SIGNEDGenFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(QuantizationSmallShapes,
+                       framework::dataset::make("DataTypeIn", DataType::QASYMM8_SIGNED)),
+                       framework::dataset::make("DataTypeOut", { DataType::QASYMM8 })),
+                       framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.0f, 0) })),
+                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.0f, -128) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_s8);
+}
+
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // QuantizationLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/RNNLayer.cpp b/tests/validation/NEON/RNNLayer.cpp
index a5f84990f2..979aa0f2c5 100644
--- a/tests/validation/NEON/RNNLayer.cpp
+++ b/tests/validation/NEON/RNNLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,8 +39,11 @@ namespace validation
 {
 namespace
 {
-RelativeTolerance<float> tolerance_f32(0.001f);
-RelativeTolerance<half>  tolerance_f16(half(0.1));
+RelativeTolerance<float> tolerance_f32(0.001f); /**< Relative tolerance value for comparing reference's output against implementation's output for DataType:F32 */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+RelativeTolerance<half> tolerance_f16(half(0.1)); /**< Relative tolerance value for comparing reference's output against implementation's output for DataType:F16 */
+constexpr float         abs_tolerance_f16(0.02f); /**< Absolute tolerance value for comparing reference's output against implementation's output for DataType:F16 */
+#endif                                            /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 } // namespace
 
 TEST_SUITE(NEON)
@@ -136,12 +139,12 @@ TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NERNNLayerFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallRNNLayerDataset(), framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_f16);
+    validate(Accessor(_target), _reference, tolerance_f16, 0.02f, abs_tolerance_f16);
 }
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 TEST_SUITE_END() // RNNLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ROIAlignLayer.cpp b/tests/validation/NEON/ROIAlignLayer.cpp
index b5deb01664..98c92a0b20 100644
--- a/tests/validation/NEON/ROIAlignLayer.cpp
+++ b/tests/validation/NEON/ROIAlignLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,6 +53,7 @@ AbsoluteTolerance<float> absolute_tolerance_f16(0.001f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
+constexpr AbsoluteTolerance<int8_t> tolerance_qasymm8_s(1);
 } // namespace
 
 TEST_SUITE(NEON)
@@ -129,10 +130,10 @@ FIXTURE_DATA_TEST_CASE(SmallROIAlignLayerHalf, NEROIAlignLayerHalfFixture, frame
 TEST_SUITE_END() // Float
 
 TEST_SUITE(Quantized)
-TEST_SUITE(QASYMM8)
 template <typename T>
 using NEROIAlignLayerQuantizedFixture = ROIAlignLayerQuantizedFixture<Tensor, Accessor, NEROIAlignLayer, T, uint16_t>;
 
+TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(Small, NEROIAlignLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
                        combine(combine(combine(combine(datasets::SmallROIDataset(),
                                                        framework::dataset::make("DataType", { DataType::QASYMM8 })),
@@ -144,10 +145,23 @@ FIXTURE_DATA_TEST_CASE(Small, NEROIAlignLayerQuantizedFixture<uint8_t>, framewor
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(Small, NEROIAlignLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(datasets::SmallROIDataset(),
+                                                       framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 127) })),
+                               framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(2.f / 255.f, 120) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8_s);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // RoiAlign
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ROIPoolingLayer.cpp b/tests/validation/NEON/ROIPoolingLayer.cpp
new file mode 100644
index 0000000000..8b5147e57f
--- /dev/null
+++ b/tests/validation/NEON/ROIPoolingLayer.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/Globals.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ROIDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ROIPoolingLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<float> relative_tolerance_f32(0.01f); /**< Relative tolerance used when validating the F32 output against the reference */
+AbsoluteTolerance<float> absolute_tolerance_f32(0.001f); /**< Absolute tolerance used when validating the F32 output against the reference */
+
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); /**< Absolute tolerance used when validating the QASYMM8 output against the reference */
+} // end namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(RoiPooling)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Successful test
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::QASYMM8), // Successful test (quantized)
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Incorrect rois type
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Mismatching data type input/output
+                                                       TensorInfo(TensorShape(250U, 128U, 2U), 1, DataType::F32), // Mismatching depth size input/output
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Mismatching number of rois and output batch size
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Invalid number of values per ROIS
+                                                       TensorInfo(TensorShape(250U, 128U, 3U), 1, DataType::F32), // Mismatching height and width input/output
+
+                                                     }),
+               framework::dataset::make("RoisInfo", { TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::F16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 10U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(4, 4U), 1, DataType::U16),
+                                                      TensorInfo(TensorShape(5, 4U), 1, DataType::U16),
+                                                    })),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::QASYMM8),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F16),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(7U, 7U, 3U, 4U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(5U, 5U, 3U, 4U), 1, DataType::F32),
+                                                     })),
+               framework::dataset::make("PoolInfo", { ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      ROIPoolingLayerInfo(7U, 7U, 1./8),
+                                                      })),
+               framework::dataset::make("Expected", { true, true, false, false, false, false, false, false })), // One expected result per InputInfo row above
+               input_info, rois_info, output_info, pool_info, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(NEROIPoolingLayer::validate(&input_info.clone()->set_is_resizable(true), &rois_info.clone()->set_is_resizable(true), &output_info.clone()->set_is_resizable(true), pool_info)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+using NEROIPoolingLayerFloatFixture = ROIPoolingLayerFixture<Tensor, Accessor, NEROIPoolingLayer, float>;
+
+TEST_SUITE(Float)
+FIXTURE_DATA_TEST_CASE(SmallROIPoolingLayerFloat, NEROIPoolingLayerFloatFixture, framework::DatasetMode::ALL,
+                       framework::dataset::combine(framework::dataset::combine(datasets::SmallROIDataset(),
+                                                                               framework::dataset::make("DataType", { DataType::F32 })),
+                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, relative_tolerance_f32, .02f, absolute_tolerance_f32);
+}
+
+TEST_SUITE_END() // Float test suite end
+
+// Begin quantized tests
+TEST_SUITE(Quantized)
+template <typename T>
+using NEROIPoolingLayerQuantizedFixture = ROIPoolingLayerQuantizedFixture<Tensor, Accessor, NEROIPoolingLayer, T>;
+
+TEST_SUITE(QASYMM8)
+
+FIXTURE_DATA_TEST_CASE(Small, NEROIPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(datasets::SmallROIDataset(),
+                                                       framework::dataset::make("DataType", { DataType::QASYMM8 })),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                       framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 127) })),
+                               framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(2.f / 255.f, 120) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+TEST_SUITE_END() // end qasymm8 tests
+TEST_SUITE_END() // end quantized tests
+
+TEST_SUITE_END() // RoiPooling
+TEST_SUITE_END() // Neon
+
+} // validation end
+} // test namespace end
+} // arm_compute namespace end
diff --git a/tests/validation/NEON/Range.cpp b/tests/validation/NEON/Range.cpp
index 06351c8f66..fda7b2c448 100644
--- a/tests/validation/NEON/Range.cpp
+++ b/tests/validation/NEON/Range.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -172,7 +172,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // Range
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ReduceMean.cpp b/tests/validation/NEON/ReduceMean.cpp
index 821171a603..8ca0bb53a7 100644
--- a/tests/validation/NEON/ReduceMean.cpp
+++ b/tests/validation/NEON/ReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,10 +46,15 @@ constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value f
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 constexpr AbsoluteTolerance<float> tolerance_f16(0.03f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
 #endif                                                   // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef __aarch64__
 constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);    /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric quantized type */
 constexpr AbsoluteTolerance<int8_t>  tolerance_s8(1);    /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */
+#else // __aarch64__
+constexpr AbsoluteTolerance<uint8_t> tolerance_u8(2);    /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric quantized type */
+constexpr AbsoluteTolerance<int8_t>  tolerance_s8(2);    /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */
+#endif // __aarch64__
 
-const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(0, 1, 2, 3) }),
+const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(2, 3), Coordinates(0, 1, 2, 3) }),
                                framework::dataset::make("KeepDims", { true }));
 const auto axis_drop = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1), Coordinates(3) }), framework::dataset::make("KeepDims", { false }));
 } // namespace
@@ -83,28 +88,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-DATA_TEST_CASE(Configuration,
-               framework::DatasetMode::ALL,
-               combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::F32 })),
-               shape, data_type)
-{
-    // Create tensors
-    Tensor ref_src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst;
-
-    Coordinates axis(1);
-
-    // Create and Configure function
-    NEReduceMean reduce_mean;
-    reduce_mean.configure(&ref_src, axis, true, &dst);
-
-    // Validate valid region
-    TensorShape output_shape = shape;
-    output_shape.set(1, 1);
-    const ValidRegion valid_region = shape_to_valid_region(output_shape);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 template <typename T>
 using NEReduceMeanFixture = ReduceMeanFixture<Tensor, Accessor, NEReduceMean, T>;
 
@@ -160,16 +143,33 @@ TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEReduceMeanQuantizedFixture<uint8_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), concat(axis_keep, axis_drop)), framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 5) })))
+                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), concat(axis_keep, axis_drop)),
+                                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 255, 5) })),
+                               framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 255, 5) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_u8);
+}
+
+TEST_SUITE(Requant)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEReduceMeanQuantizedFixture<uint8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), axis_drop),
+                                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 255, 5) })),
+                               framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 200, 16) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
 }
+TEST_SUITE_END() // Requant
 
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        NEReduceMeanQuantizedFixture<uint8_t>,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), concat(axis_keep, axis_drop)), framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 5) })))
+                       combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), concat(axis_keep, axis_drop)),
+                                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 255, 5) })),
+                               framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 255, 5) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
@@ -180,15 +180,32 @@ TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEReduceMeanQuantizedFixture<int8_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), concat(axis_keep, axis_drop)), framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 127, -10), QuantizationInfo(1.f / 250, -20) })))
+                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), concat(axis_keep, axis_drop)),
+                                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 127, -10), QuantizationInfo(1.f / 250, -20) })),
+                               framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 127, -10) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_s8);
 }
+TEST_SUITE(Requant)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NEReduceMeanQuantizedFixture<int8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), axis_drop),
+                                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 102, 2) })),
+                               framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 113, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_s8);
+}
+TEST_SUITE_END() // Requant
+
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        NEReduceMeanQuantizedFixture<int8_t>,
                        framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), concat(axis_keep, axis_drop)), framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 127, 0) })))
+                       combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), concat(axis_keep, axis_drop)),
+                                       framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 127, -10) })),
+                               framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 127, -10) })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_s8);
@@ -196,7 +213,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // ReduceMean
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ReductionOperation.cpp b/tests/validation/NEON/ReductionOperation.cpp
index f155e97d0d..48c3a1a788 100644
--- a/tests/validation/NEON/ReductionOperation.cpp
+++ b/tests/validation/NEON/ReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,7 @@ namespace
 AbsoluteTolerance<float> tolerance_f32(0.0001f);
 RelativeTolerance<float> rel_tolerance_f32(0.0001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-AbsoluteTolerance<float> tolerance_f16(0.1f);
+AbsoluteTolerance<float> tolerance_f16(0.2f);
 RelativeTolerance<float> rel_tolerance_f16(0.1f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 /** Tolerance for quantized operations */
@@ -106,6 +106,28 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                                                         keep_dims));
     ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
 }
+
+DATA_TEST_CASE(ValidateNoPadding, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis",
+{ 0, 1 })), framework::dataset::make("ReductionOperation", {ReductionOperation::SUM,})), KeepDims),
+               shape, data_type, axis, op, keep_dims)
+{
+    TensorShape         input_shape = TensorShape(shape);
+    TensorInfo input_info   = TensorInfo(input_shape, 1, data_type);
+    const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+    const bool _keep_dims = keep_dims && !is_arg_min_max;
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(shape, axis, keep_dims);
+
+    // Create tensors
+    Tensor src     = create_tensor<Tensor>(input_shape, data_type, 1, QuantizationInfo());
+    Tensor dst     = create_tensor<Tensor>(output_shape, data_type, 1, QuantizationInfo());
+
+    // Create and configure function
+    NEReductionOperation reduction;
+    reduction.configure(&src, &dst, axis, op, _keep_dims);
+
+    validate(src.info()->padding(), PaddingSize(0, 0, 0, 0));
+    validate(dst.info()->padding(), PaddingSize(0, 0, 0, 0));
+}
 // clang-format on
 // *INDENT-ON*
 
@@ -172,7 +194,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReductionOperationQuantizedFixture<int8_t>, f
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 TEST_SUITE_END() // ReductionOperation
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Remap.cpp b/tests/validation/NEON/Remap.cpp
deleted file mode 100644
index 2e54b1152e..0000000000
--- a/tests/validation/NEON/Remap.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NERemap.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/RemapFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr AbsoluteTolerance<uint8_t> tolerance_value(0);
-constexpr float                      tolerance_number = 0.f;
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(Remap)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                           framework::dataset::make("DataType", DataType::U8)),
-                                                                   framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })),
-               shape, policy, data_type, border_mode)
-{
-    Tensor src   = create_tensor<Tensor>(shape, data_type);
-    Tensor map_x = create_tensor<Tensor>(shape, DataType::F32);
-    Tensor map_y = create_tensor<Tensor>(shape, DataType::F32);
-    Tensor dst   = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(map_x.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(map_y.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NERemap remap;
-    remap.configure(&src, &map_x, &map_y, &dst, policy, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    const int total_right  = ceil_to_multiple(shape[0], 16);
-    const int access_right = total_right + (((total_right - shape[0]) == 0) ? 1 : 0);
-
-    const PaddingSize read_padding(1, access_right - shape[0], 1, 1);
-    validate(src.info()->padding(), read_padding);
-
-    PaddingCalculator calculator(shape.x(), 16);
-    validate(dst.info()->padding(), calculator.required_padding());
-}
-
-template <typename T>
-using NERemapFixture = RemapValidationFixture<Tensor, Accessor, NERemap, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NERemapFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                             framework::dataset::make("DataType",
-                                                                                                                     DataType::U8)),
-                                                                                                     framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NERemapFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                           framework::dataset::make("DataType",
-                                                                                                                   DataType::U8)),
-                                                                                                   framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/ReorderLayer.cpp b/tests/validation/NEON/ReorderLayer.cpp
new file mode 100644
index 0000000000..839ad0ac92
--- /dev/null
+++ b/tests/validation/NEON/ReorderLayer.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__)
+
+#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ReorderLayerDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ReorderFixture.h"
+#include "src/core/NEON/kernels/NEReorderKernel.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using framework::dataset::make;
+
+TEST_SUITE(NEON)
+TEST_SUITE(ReorderLayer)
+
+template <typename T>
+using NEReorderLayerAlias = ReorderValidationFixture<Tensor, Accessor, NEReorderLayer, T>;
+
+TEST_SUITE(FP32)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+DATA_TEST_CASE(ValidateReorderOHWIo8, framework::DatasetMode::ALL, combine(
+                                                                    zip(
+                                                                     make("InShape",{ TensorShape(10U, 9U), TensorShape(234U, 301U) }),
+                                                                     make("OutShape", { TensorShape(10U, 16U), TensorShape(234U, 304U) })
+                                                                    ),
+                                                                    zip(
+                                                                        make("InputWeightFormat", {WeightFormat::OHWI}),
+                                                                        make("OutputWeightFormat", {WeightFormat::OHWIo8})
+                                                                    )),
+            input_shape, output_shape,  input_wf,  output_wf)
+{
+    if(Scheduler::get().cpu_info().has_sve()){
+        arm_compute::NEReorderLayer reorder_layer;
+        int vector_length = arm_gemm::utils::get_vector_length<float>();
+        bool expected_bool_status = false;
+        if (vector_length == 8)
+        {
+            expected_bool_status = true;
+        }
+
+        TensorInfo input_tensor_info(input_shape, 1, DataType::F32);
+        TensorInfo output_tensor_info(output_shape, 1, DataType::F32);
+
+        Status status = reorder_layer.validate(&input_tensor_info, &output_tensor_info, input_wf, output_wf);
+
+        ARM_COMPUTE_EXPECT((expected_bool_status == bool(status)), framework::LogLevel::ERRORS);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunBlock8, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock8(), make("DataType", DataType::F32)))
+{
+    // Validate output
+    if (_hardware_supports)
+    {
+        validate(Accessor(_target), _reference);
+    }
+}
+#endif // ARM_COMPUTE_ENABLE_SVE
+
+FIXTURE_DATA_TEST_CASE(RunBlock4, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock4(), make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+TEST_SUITE_END() // FP32
+
+TEST_SUITE_END() // ReorderLayer
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif  // defined(__aarch64__)
diff --git a/tests/validation/NEON/ReorgLayer.cpp b/tests/validation/NEON/ReorgLayer.cpp
index 5a76315d77..38bef51ed3 100644
--- a/tests/validation/NEON/ReorgLayer.cpp
+++ b/tests/validation/NEON/ReorgLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -69,44 +69,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(datasets::SmallReorgLayerDataset(), datasets::LargeReorgLayerDataset()),
-                                                                           framework::dataset::make("DataType", { DataType::F32, DataType::F16, DataType::QASYMM8 })),
-                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-               shape, stride, data_type, data_layout)
-{
-    // Permute the tensor shape in case of NHWC data layout
-    TensorShape shape_to_use = shape;
-    if(data_layout == DataLayout::NHWC)
-    {
-        permute(shape_to_use, PermutationVector(2U, 0U, 1U));
-    }
-
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape_to_use, data_type, 1, QuantizationInfo(), data_layout);
-    Tensor dst;
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEReorgLayer reorg_layer;
-
-    // Auto-initialize the output within the function
-    reorg_layer.configure(&src, &dst, stride);
-
-    // Validate valid region
-    const ValidRegion src_valid_region = shape_to_valid_region(shape_to_use);
-    const ValidRegion dst_valid_region = shape_to_valid_region(dst.info()->tensor_shape());
-    validate(src.info()->valid_region(), src_valid_region);
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    const int         step        = 1;
-    const PaddingSize src_padding = PaddingCalculator(shape_to_use.x(), step).required_padding();
-    const PaddingSize dst_padding = PaddingCalculator(dst.info()->tensor_shape().x(), step).required_padding();
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
 template <typename T>
 using NEReorgLayerFixture = ReorgLayerValidationFixture<Tensor, Accessor, NEReorgLayer, T>;
 
@@ -164,7 +126,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEReorgLayerFixture<int8_t>, framework::Dataset
 TEST_SUITE_END() // S8
 
 TEST_SUITE_END() // ReorgLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ReshapeLayer.cpp b/tests/validation/NEON/ReshapeLayer.cpp
index 3d1e0e618b..e9f114d491 100644
--- a/tests/validation/NEON/ReshapeLayer.cpp
+++ b/tests/validation/NEON/ReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,6 +77,9 @@ input_info, output_info, expected)
 template <typename T>
 using NEReshapeLayerFixture = ReshapeLayerValidationFixture<Tensor, Accessor, NEReshapeLayer, T>;
 
+template <typename T>
+using NEReshapeLayerPaddedFixture = ReshapeLayerPaddedValidationFixture<Tensor, Accessor, NEReshapeLayer, T>;
+
 TEST_SUITE(Float)
 TEST_SUITE(F32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::F32)))
@@ -84,8 +87,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<float>, framework::Datase
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() //F32
+TEST_SUITE_END() //Float
 
 TEST_SUITE(Integer)
 TEST_SUITE(S8)
@@ -94,7 +97,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<int8_t>, framework::Datas
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() //S8
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::S16)))
@@ -102,11 +105,41 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerFixture<int16_t>, framework::Data
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() //S16
+TEST_SUITE_END() //Integer
+
+TEST_SUITE(Padded)
+TEST_SUITE(Float)
+TEST_SUITE(F32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerPaddedFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() //F32
+TEST_SUITE_END() //Float
+
+TEST_SUITE(Integer)
+TEST_SUITE(S8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerPaddedFixture<int8_t>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::S8)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() //S8
+
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEReshapeLayerPaddedFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType", DataType::S16)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() //S16
+TEST_SUITE_END() //Integer
+TEST_SUITE_END() //Padded
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() //ReshapeLayer
+TEST_SUITE_END() //NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Reverse.cpp b/tests/validation/NEON/Reverse.cpp
index 2f3f69aee9..7b5337f14b 100644
--- a/tests/validation/NEON/Reverse.cpp
+++ b/tests/validation/NEON/Reverse.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,8 @@ namespace validation
 {
 namespace
 {
-auto run_small_dataset = combine(datasets::SmallShapes(), datasets::Tiny1DShapes());
+using framework::dataset::make;
+auto run_small_dataset = combine(datasets::Small3DShapes(), datasets::Tiny1DShapes());
 auto run_large_dataset = combine(datasets::LargeShapes(), datasets::Tiny1DShapes());
 
 } // namespace
@@ -53,28 +54,31 @@ TEST_SUITE(Reverse)
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-        framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), // Invalid axis datatype
+        make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8), // Invalid axis datatype
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid axis shape
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid axis length (> 4)
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Mismatching shapes
+                                            TensorInfo(TensorShape(32U, 13U, 17U, 3U, 2U), 1, DataType::U8), // Unsupported source dimensions (>4)
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(2U), 1, DataType::U8),
         }),
-        framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8),
+        make("OutputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(2U, 13U, 2U), 1, DataType::U8),
+                                            TensorInfo(TensorShape(32U, 13U, 17U, 3U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(2U), 1, DataType::U8),
         })),
-        framework::dataset::make("AxisInfo", { TensorInfo(TensorShape(3U), 1, DataType::U8),
+        make("AxisInfo", { TensorInfo(TensorShape(3U), 1, DataType::U8),
                                            TensorInfo(TensorShape(2U, 10U), 1, DataType::U32),
                                            TensorInfo(TensorShape(8U), 1, DataType::U32),
                                            TensorInfo(TensorShape(2U), 1, DataType::U32),
                                            TensorInfo(TensorShape(2U), 1, DataType::U32),
                                            TensorInfo(TensorShape(2U), 1, DataType::U32),
+                                           TensorInfo(TensorShape(2U), 1, DataType::U32),
         })),
-        framework::dataset::make("Expected", { false, false, false, false, true, true})),
+        make("Expected", { false, false, false, false, false, true, true})),
         src_info, dst_info, axis_info, expected)
 {
     Status s = NEReverse::validate(&src_info.clone()->set_is_resizable(false),
@@ -95,7 +99,11 @@ TEST_SUITE(F16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEReverseFixture<half>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(run_small_dataset, framework::dataset::make("DataType", DataType::F16)))
+                       combine(
+                           run_small_dataset,
+                           make("DataType", DataType::F16),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -104,7 +112,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        NEReverseFixture<half>,
                        framework::DatasetMode::NIGHTLY,
-                       combine(run_large_dataset, framework::dataset::make("DataType", DataType::F16)))
+                       combine(
+                           run_large_dataset,
+                           make("DataType", DataType::F16),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -116,7 +128,11 @@ TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEReverseFixture<float>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(run_small_dataset, framework::dataset::make("DataType", DataType::F32)))
+                       combine(
+                           run_small_dataset,
+                           make("DataType", DataType::F32),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -125,7 +141,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        NEReverseFixture<float>,
                        framework::DatasetMode::NIGHTLY,
-                       combine(run_large_dataset, framework::dataset::make("DataType", DataType::F32)))
+                       combine(
+                           run_large_dataset,
+                           make("DataType", DataType::F32),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -138,7 +158,11 @@ TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NEReverseFixture<uint8_t>,
                        framework::DatasetMode::PRECOMMIT,
-                       combine(run_small_dataset, framework::dataset::make("DataType", DataType::QASYMM8)))
+                       combine(
+                           run_small_dataset,
+                           make("DataType", DataType::QASYMM8),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -147,7 +171,11 @@ FIXTURE_DATA_TEST_CASE(RunSmall,
 FIXTURE_DATA_TEST_CASE(RunLarge,
                        NEReverseFixture<uint8_t>,
                        framework::DatasetMode::NIGHTLY,
-                       combine(run_large_dataset, framework::dataset::make("DataType", DataType::QASYMM8)))
+                       combine(
+                           run_large_dataset,
+                           make("DataType", DataType::QASYMM8),
+                           make("use_negative_axis", { true, false }),
+                           make("use_inverted_axis", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -156,7 +184,7 @@ TEST_SUITE_END() // QASYMM8
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // Reverse
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp
index 9ab169b251..f1209a21ac 100644
--- a/tests/validation/NEON/Scale.cpp
+++ b/tests/validation/NEON/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,20 +22,10 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEScale.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/InterpolationPolicyDataset.h"
-#include "tests/datasets/SamplingPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
+#include "tests/datasets/ScaleValidationDataset.h"
 #include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Helpers.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ScaleFixture.h"
 
@@ -47,95 +37,15 @@ namespace validation
 {
 namespace
 {
-using test::datasets::ShapeDataset;
-
-/** Class to generate boundary values for the given template parameters
- * including shapes with large differences between width and height
- */
-template <uint32_t channel, uint32_t batch, uint32_t element_per_vector, uint32_t vector_size>
-class ScaleShapesBaseDataSet : public ShapeDataset
-{
-    static constexpr auto boundary_minus_one = element_per_vector * vector_size - 1;
-    static constexpr auto boundary_plus_one  = element_per_vector * vector_size + 1;
-    static constexpr auto small_size         = 3;
-
-public:
-    // These tensor shapes are NCHW layout, fixture will convert to NHWC.
-    ScaleShapesBaseDataSet()
-        : ShapeDataset("Shape",
-    {
-        TensorShape{ small_size, boundary_minus_one, channel, batch },
-                     TensorShape{ small_size, boundary_plus_one, channel, batch },
-                     TensorShape{ boundary_minus_one, small_size, channel, batch },
-                     TensorShape{ boundary_plus_one, small_size, channel, batch },
-                     TensorShape{ boundary_minus_one, boundary_plus_one, channel, batch },
-                     TensorShape{ boundary_plus_one, boundary_minus_one, channel, batch },
-    })
-    {
-    }
-};
-
-/** For the single vector, only larger value (+1) than boundary
- * since smaller value (-1) could cause some invalid shapes like
- * - invalid zero size
- * - size 1 which isn't compatible with scale with aligned corners.
- */
-template <uint32_t channel, uint32_t batch, uint32_t element_per_vector>
-class ScaleShapesBaseDataSet<channel, batch, element_per_vector, 1> : public ShapeDataset
-{
-    static constexpr auto small_size        = 3;
-    static constexpr auto boundary_plus_one = element_per_vector + 1;
-
-public:
-    // These tensor shapes are NCHW layout, fixture will convert to NHWC.
-    ScaleShapesBaseDataSet()
-        : ShapeDataset("Shape",
-    {
-        TensorShape{ small_size, boundary_plus_one, channel, batch },
-                     TensorShape{ boundary_plus_one, small_size, channel, batch },
-    })
-    {
-    }
-};
-
-/** For the shapes smaller than one vector, only pre-defined tiny shapes
- * are tested (3x2, 2x3) as smaller shapes are more likely to cause
- * issues and easier to debug.
- */
-template <uint32_t channel, uint32_t batch, uint32_t element_per_vector>
-class ScaleShapesBaseDataSet<channel, batch, element_per_vector, 0> : public ShapeDataset
-{
-    static constexpr auto small_size                 = 3;
-    static constexpr auto zero_vector_boundary_value = 2;
-
-public:
-    // These tensor shapes are NCHW layout, fixture will convert to NHWC.
-    ScaleShapesBaseDataSet()
-        : ShapeDataset("Shape",
-    {
-        TensorShape{ small_size, zero_vector_boundary_value, channel, batch },
-                     TensorShape{ zero_vector_boundary_value, small_size, channel, batch },
-    })
-    {
-    }
-};
-
-/** Generated shaeps
- * - 2D shapes with 0, 1, 2 vector iterations
- * - 3D shapes with 0, 1 vector iterations
- * - 4D shapes with 0 vector iterations
- */
-#define SCALE_SHAPE_DATASET(element_per_vector)                                                  \
-    concat(concat(concat(concat(concat(ScaleShapesBaseDataSet<1, 1, (element_per_vector), 0>(),  \
-                                       ScaleShapesBaseDataSet<1, 1, (element_per_vector), 1>()), \
-                                ScaleShapesBaseDataSet<1, 1, (element_per_vector), 2>()),        \
-                         ScaleShapesBaseDataSet<3, 3, (element_per_vector), 0>()),               \
-                  ScaleShapesBaseDataSet<3, 3, (element_per_vector), 1>()),                      \
-           ScaleShapesBaseDataSet<3, 7, (element_per_vector), 0>())
+using datasets::ScaleShapesBaseDataSet;
+using datasets::ScaleInterpolationPolicySet;
+using datasets::ScaleDataLayouts;
+using datasets::ScaleSamplingPolicySet;
+using datasets::ScaleAlignCornersSamplingPolicySet;
 
 /** We consider vector size in byte 64 since the maximum size of
- * a vector used by @ref ScaleKernelInfo is currently 64-byte (float32x4x4).
- * There are possibility to reduce test time further by using
+ * a vector used by the kernel is currently 64-byte (float32x4x4).
+ * There is a possibility to reduce test time further by using
  * smaller vector sizes for different data types where applicable.
  */
 constexpr uint32_t vector_byte = 64;
@@ -146,71 +56,31 @@ constexpr uint32_t num_elements_per_vector()
     return vector_byte / sizeof(T);
 }
 
-/** Scale data types */
-const auto ScaleDataTypes = framework::dataset::make("DataType",
-{
-    DataType::U8,
-    DataType::S16,
-    DataType::F32,
-});
-
-/** Interpolation policy test set */
-const auto InterpolationPolicySet = framework::dataset::make("InterpolationPolicy",
-{
-    InterpolationPolicy::NEAREST_NEIGHBOR,
-    InterpolationPolicy::BILINEAR,
-});
-
-/** Scale data types */
-const auto ScaleDataLayouts = framework::dataset::make("DataLayout",
-{
-    DataLayout::NCHW,
-    DataLayout::NHWC,
-});
-
-/** Sampling policy data set */
-const auto SamplingPolicySet = framework::dataset::make("SamplingPolicy",
+/** Quantization information data set */
+const auto QuantizationInfoSet = framework::dataset::make("QuantizationInfo",
 {
-    SamplingPolicy::TOP_LEFT,
-    SamplingPolicy::CENTER,
+    QuantizationInfo(0.5f, -10),
 });
 
-/** Align corners */
-const auto AlignCorners = framework::dataset::make("AlignCorners",
+/** Input quantization information data set */
+const auto InputQuantizationInfoSet = framework::dataset::make("InputQuantizationInfo",
 {
-    false,
-    true,
+    QuantizationInfo(0.5f, -10),
 });
 
-/** Generating dataset for non-quantized data tyeps with the given shapes */
-#define ASSEMBLE_DATASET(shape)                                         \
-    combine(combine(combine(combine(combine((shape), ScaleDataLayouts), \
-                                    InterpolationPolicySet),            \
-                            datasets::BorderModes()),                   \
-                    SamplingPolicySet),                                 \
-            AlignCorners)
-
 /** Quantization information data set */
-const auto QuantizationInfoSet = framework::dataset::make("QuantizationInfo",
+const auto OutputQuantizationInfoSet = framework::dataset::make("OutputQuantizationInfo",
 {
-    QuantizationInfo(0.5f, -10),
+    QuantizationInfo(0.2f, 20),
 });
 
-/** Generating dataset for quantized data tyeps with the given shapes */
-#define ASSEMBLE_QUANTIZED_DATASET(shape)                                 \
-    combine(combine(combine(combine(combine(combine(shape,                \
-                                                    QuantizationInfoSet), \
-                                            ScaleDataLayouts),            \
-                                    InterpolationPolicySet),              \
-                            datasets::BorderModes()),                     \
-                    SamplingPolicySet),                                   \
-            AlignCorners)
-
 /** Tolerance */
 constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);
+constexpr AbsoluteTolerance<int8_t>  tolerance_s8(1);
 constexpr AbsoluteTolerance<int16_t> tolerance_s16(1);
-RelativeTolerance<float>             tolerance_f32(0.01);
+RelativeTolerance<float>             tolerance_f32(0.05);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+constexpr float         abs_tolerance_f16(0.01f);
 RelativeTolerance<half> tolerance_f16(half(0.1));
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
@@ -224,9 +94,8 @@ TEST_SUITE(Validate)
 
 /** Validate test suite is to test ARM_COMPUTE_RETURN_ON_* macros
  * we use to check the validity of given arguments in @ref NEScale
- * and subsequent call to @ref NEScaleKernel.
  * Since this is using validate() of @ref NEScale, which pre-adjust
- * arguments for @ref NEScaleKernel, the following conditions in
+ * arguments for the kernel, the following conditions in
  * the kernel are not currently tested.
  * - The same input and output
  * - Data type of offset, dx and dy
@@ -247,9 +116,8 @@ const auto output_shape = TensorShape{ 4, 6, 3, 2 };
 constexpr auto default_data_type            = DataType::U8;
 constexpr auto default_data_layout          = DataLayout::NHWC;
 constexpr auto default_interpolation_policy = InterpolationPolicy::NEAREST_NEIGHBOR;
-constexpr auto default_border_mode          = BorderMode::UNDEFINED;
+constexpr auto default_border_mode          = BorderMode::CONSTANT;
 constexpr auto default_sampling_policy      = SamplingPolicy::CENTER;
-constexpr bool default_use_padding          = false;
 
 TEST_CASE(NullPtr, framework::DatasetMode::ALL)
 {
@@ -258,17 +126,11 @@ TEST_CASE(NullPtr, framework::DatasetMode::ALL)
     Status     result{};
 
     // nullptr is given as input
-    result = NEScale::validate(nullptr, &output, default_interpolation_policy, default_border_mode);
-    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
-
-    result = NEScale::validate(nullptr, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+    result = NEScale::validate(nullptr, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false });
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
 
     // nullptr is given as output
-    result = NEScale::validate(&input, nullptr, default_interpolation_policy, default_border_mode);
-    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
-
-    result = NEScale::validate(&input, nullptr, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+    result = NEScale::validate(&input, nullptr, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false });
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
 }
 
@@ -293,8 +155,6 @@ TEST_CASE(SupportDataType, framework::DatasetMode::ALL)
         { DataType::BFLOAT16, false },
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         { DataType::F16, true },
-#else  // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        { DataType::F16, false },
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         { DataType::F32, true },
         { DataType::F64, false },
@@ -306,10 +166,7 @@ TEST_CASE(SupportDataType, framework::DatasetMode::ALL)
         const auto input  = TensorInfo{ input_shape, 1, kv.first, default_data_layout };
         const auto output = TensorInfo{ output_shape, 1, kv.first, default_data_layout };
 
-        result = NEScale::validate(&input, &output, default_interpolation_policy, default_border_mode);
-        ARM_COMPUTE_EXPECT(bool(result) == kv.second, framework::LogLevel::ERRORS);
-
-        result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+        result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false });
         ARM_COMPUTE_EXPECT(bool(result) == kv.second, framework::LogLevel::ERRORS);
     }
 }
@@ -322,10 +179,7 @@ TEST_CASE(MissmatchingDataType, framework::DatasetMode::ALL)
     const auto output = TensorInfo{ output_shape, 1, non_default_data_type, default_data_layout };
     Status     result{};
 
-    result = NEScale::validate(&input, &output, default_interpolation_policy, default_border_mode);
-    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
-
-    result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+    result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false });
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
 }
 
@@ -335,12 +189,9 @@ TEST_CASE(UsePadding, framework::DatasetMode::ALL)
     const auto output = TensorInfo{ output_shape, 1, default_data_type, default_data_layout };
     Status     result{};
 
-    // When use padding is false, border mode should be constant
-    constexpr auto border_mode = BorderMode::UNDEFINED;
-    constexpr bool use_padding = false;
-
-    result = NEScale::validate(&input, &output, default_interpolation_policy, border_mode, PixelValue(), default_sampling_policy, use_padding);
-    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+    // Padding is no longer supported, so validate() must fail when use_padding is true
+    constexpr auto border_mode = BorderMode::CONSTANT;
+    constexpr bool use_padding = true;
 
     result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, border_mode, PixelValue(), default_sampling_policy, use_padding });
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
@@ -356,10 +207,7 @@ TEST_CASE(AreaWithNHWC, framework::DatasetMode::ALL)
     const auto output = TensorInfo{ output_shape, 1, default_data_type, data_layout };
     Status     result{};
 
-    result = NEScale::validate(&input, &output, interpolation_policy, default_border_mode);
-    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
-
-    result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode });
+    result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false });
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
 }
 
@@ -374,43 +222,156 @@ TEST_CASE(AreaWithNonU8, framework::DatasetMode::ALL)
     const auto output = TensorInfo{ output_shape, 1, data_type, data_layout };
     Status     result{};
 
-    result = NEScale::validate(&input, &output, interpolation_policy, default_border_mode);
-    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
-
-    result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode });
+    result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false });
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
 }
 
-TEST_CASE(InvalidAlignedCornerOutput, framework::DatasetMode::ALL)
+TEST_CASE(AlignedCornerNotSupported, framework::DatasetMode::ALL)
 {
-    // Bilinear with aligned corners require at least 2x2 output to prevent overflow.
-    // Also, aligned corners require sampling policy to be TOP_LEFT.
+    // Aligned corners require sampling policy to be TOP_LEFT, so CENTER must be rejected.
     constexpr auto interpolation_policy = InterpolationPolicy::BILINEAR;
     constexpr bool align_corners        = true;
-    constexpr auto sampling_policy      = SamplingPolicy::TOP_LEFT;
-    const auto     invalid_output_shape = TensorShape{ 1, 1, 3, 2 };
+    constexpr auto sampling_policy      = SamplingPolicy::CENTER;
 
     const auto input  = TensorInfo{ input_shape, 1, default_data_type, default_data_layout };
-    const auto output = TensorInfo{ invalid_output_shape, 1, default_data_type, default_data_layout };
+    const auto output = TensorInfo{ output_shape, 1, default_data_type, default_data_layout };
     Status     result{};
 
-    result = NEScale::validate(&input, &output, interpolation_policy, default_border_mode, PixelValue(), sampling_policy, default_use_padding, align_corners);
-    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
-
-    result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), sampling_policy, default_use_padding, align_corners });
+    result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false, align_corners });
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
 }
 TEST_SUITE_END() // Validate
 
+DATA_TEST_CASE(CheckNoPadding, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::Medium4DShapes(),
+                                                                                            framework::dataset::make("DataType", { DataType::F32, DataType::QASYMM8 })),
+                                                                                    framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::BILINEAR, InterpolationPolicy::NEAREST_NEIGHBOR })),
+                                                                            framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER, SamplingPolicy::TOP_LEFT })),
+                                                                    framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })),
+               shape, data_type, interpolation_policy, sampling_policy, data_layout)
+{
+    constexpr auto  default_border_mode = BorderMode::CONSTANT;
+    ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false);
+    // Test purpose: configuring NEScale must leave both tensors without padding.
+    // Create the source tensor and tag it with the data layout under test
+    Tensor src = create_tensor<Tensor>(shape, data_type);
+    src.info()->set_data_layout(data_layout);
+    // Destination shape: width and height dimensions scaled by 0.5
+    const float scale_x = 0.5f;
+    const float scale_y = 0.5f;
+    TensorShape shape_scaled(shape);
+    const int   idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int   idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    shape_scaled.set(idx_width, shape[idx_width] * scale_x, /* apply_dim_correction = */ false);
+    shape_scaled.set(idx_height, shape[idx_height] * scale_y, /* apply_dim_correction = */ false);
+    Tensor dst = create_tensor<Tensor>(shape_scaled, data_type);
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure the function with the kernel info built above
+    NEScale scale;
+    scale.configure(&src, &dst, info);
+
+    validate(src.info()->padding(), PaddingSize(0, 0, 0, 0));
+    validate(dst.info()->padding(), PaddingSize(0, 0, 0, 0));
+}
+
+DATA_TEST_CASE(CheckNoPaddingInterpAREA, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::Medium4DShapes(),
+                                                                                                      framework::dataset::make("DataType", { DataType::U8 })),
+                                                                                              framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::AREA })),
+                                                                                      framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER, SamplingPolicy::TOP_LEFT })),
+                                                                              framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+               shape, data_type, interpolation_policy, sampling_policy, data_layout)
+{
+    constexpr auto  default_border_mode = BorderMode::CONSTANT;
+    ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false);
+    // Test purpose: configuring NEScale with AREA interpolation (U8, NCHW) must leave both tensors without padding.
+    // Create the source tensor and tag it with the data layout under test
+    Tensor src = create_tensor<Tensor>(shape, data_type);
+    src.info()->set_data_layout(data_layout);
+    // Compute a shape with the width and height dimensions scaled by 0.5
+    const float scale_x = 0.5f;
+    const float scale_y = 0.5f;
+    TensorShape shape_scaled(shape);
+    const int   idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int   idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    shape_scaled.set(idx_width, shape[idx_width] * scale_x, /* apply_dim_correction = */ false);
+    shape_scaled.set(idx_height, shape[idx_height] * scale_y, /* apply_dim_correction = */ false);
+    // NOTE(review): dst is created with the unscaled shape and shape_scaled is never read - confirm this is intended
+    Tensor dst = create_tensor<Tensor>(shape, data_type);
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure the function with the kernel info built above
+    NEScale scale;
+    scale.configure(&src, &dst, info);
+
+    validate(src.info()->padding(), PaddingSize(0, 0, 0, 0));
+    validate(dst.info()->padding(), PaddingSize(0, 0, 0, 0));
+}
+
 template <typename T>
 using NEScaleFixture = ScaleValidationFixture<Tensor, Accessor, NEScale, T>;
 template <typename T>
+using NEScaleMixedDataLayoutFixture = ScaleValidationFixture<Tensor, Accessor, NEScale, T, true>;
+template <typename T>
 using NEScaleQuantizedFixture = ScaleValidationQuantizedFixture<Tensor, Accessor, NEScale, T>;
+template <typename T>
+using NEScaleDifferentOutputQuantizedFixture = ScaleValidationDifferentOutputQuantizedFixture<Tensor, Accessor, NEScale, T>;
+template <typename T>
+using NEScaleQuantizedMixedDataLayoutFixture = ScaleValidationQuantizedFixture<Tensor, Accessor, NEScale, T, true>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-const auto f32_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<float>())), framework::dataset::make("DataType", DataType::F32));
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f32_shape))
+const auto f32_shape      = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<float>())), framework::dataset::make("DataType", DataType::F32));
+const auto f32_shape_nhwc = combine(datasets::Small3DShapes(), framework::dataset::make("DataType", DataType::F32));
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f32_shape, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEScaleMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, ASSEMBLE_DATASET(f32_shape, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f32_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumNHWC, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f32_shape_nhwc, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumMixedDataLayoutNHWC, NEScaleMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, ASSEMBLE_NHWC_DATASET(f32_shape_nhwc, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumAlignCornersNHWC, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f32_shape_nhwc, ScaleAlignCornersSamplingPolicySet))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -422,15 +383,52 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<float>, framework::DatasetMode::
 TEST_SUITE_END() // FP32
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-const auto f16_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<half>())), framework::dataset::make("DataType", DataType::F16));
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f16_shape))
+const auto f16_shape      = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<half>())), framework::dataset::make("DataType", DataType::F16));
+const auto f16_shape_nhwc = combine(datasets::Small3DShapes(), framework::dataset::make("DataType", DataType::F16));
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f16_shape, ScaleSamplingPolicySet))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
     const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
-    validate(Accessor(_target), _reference, valid_region, tolerance_f16);
+    validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f16_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumNHWC, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f16_shape_nhwc, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumMixedDataLayoutNHWC, NEScaleMixedDataLayoutFixture<half>, framework::DatasetMode::PRECOMMIT, ASSEMBLE_NHWC_DATASET(f16_shape_nhwc, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunMediumAlignCornersNHWC, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_NHWC_DATASET(f16_shape_nhwc, ScaleAlignCornersSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f16, 0.0f, abs_tolerance_f16);
 }
 TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -439,7 +437,16 @@ TEST_SUITE_END() // Float
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
 const auto u8_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<uint8_t>())), framework::dataset::make("DataType", DataType::U8));
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(u8_shape))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(u8_shape, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_u8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(u8_shape, ScaleAlignCornersSamplingPolicySet))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -449,9 +456,39 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<uint8_t>, framework::DatasetMode
     validate(Accessor(_target), _reference, valid_region, tolerance_u8);
 }
 TEST_SUITE_END() // U8
+TEST_SUITE(S8)
+const auto s8_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<int8_t>())), framework::dataset::make("DataType", DataType::S8));
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_S8_DATASET(s8_shape, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_s8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_S8_DATASET(s8_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_s8);
+}
+TEST_SUITE_END() // S8
 TEST_SUITE(S16)
 const auto s16_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<int16_t>())), framework::dataset::make("DataType", DataType::S16));
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<int16_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(s16_shape))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<int16_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(s16_shape, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_s16, tolerance_num_s16);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<int16_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(s16_shape, ScaleAlignCornersSamplingPolicySet))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -466,7 +503,37 @@ TEST_SUITE_END() // Integer
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 const auto qasymm8_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<uint8_t>())), framework::dataset::make("DataType", DataType::QASYMM8));
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape, ScaleSamplingPolicySet, QuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_u8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallDifferentOutputQuantization, NEScaleDifferentOutputQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+                       ASSEMBLE_DIFFERENTLY_QUANTIZED_DATASET(qasymm8_shape, ScaleSamplingPolicySet, InputQuantizationInfoSet, OutputQuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_u8);
+}
+FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEScaleQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape, ScaleSamplingPolicySet,
+                       QuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_u8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape, ScaleAlignCornersSamplingPolicySet,
+                       QuantizationInfoSet))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -477,21 +544,42 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<uint8_t>, framework::Da
 }
 TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
-const auto qasymm8_signed_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<int8_t>())), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_signed_shape))
+const auto                          qasymm8_signed_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<int8_t>())), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
+constexpr AbsoluteTolerance<int8_t> tolerance_qasymm8_signed{ 1 };
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_signed_shape, ScaleSamplingPolicySet, QuantizationInfoSet))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
     ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
-    validate(Accessor(_target), _reference, valid_region, tolerance_u8);
+    validate(Accessor(_target), _reference, valid_region, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallDifferentOutputQuantization, NEScaleDifferentOutputQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+                       ASSEMBLE_DIFFERENTLY_QUANTIZED_DATASET(qasymm8_signed_shape, ScaleSamplingPolicySet, InputQuantizationInfoSet, OutputQuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleQuantizedFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_signed_shape, ScaleAlignCornersSamplingPolicySet,
+                       QuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_qasymm8_signed);
 }
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // Scale
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Schaar.cpp b/tests/validation/NEON/Schaar.cpp
deleted file mode 100644
index 0b96eee70e..0000000000
--- a/tests/validation/NEON/Schaar.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/GradientDimensionDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ScharrFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(Scharr)
-
-TEST_SUITE(W3x3)
-using NEScharr3x3Fixture = ScharrValidationFixture<Tensor, Accessor, NEScharr3x3, uint8_t, int16_t>;
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                   Format::U8)),
-               shape, border_mode, format)
-{
-    // Generate a random constant value
-    std::mt19937                           gen(library->seed());
-    std::uniform_int_distribution<uint8_t> int_dist(0, 255);
-    const uint8_t                          constant_border_value = int_dist(gen);
-
-    // Create tensors
-    Tensor src   = create_tensor<Tensor>(shape, data_type_from_format(format));
-    Tensor dst_x = create_tensor<Tensor>(shape, DataType::S16);
-    Tensor dst_y = create_tensor<Tensor>(shape, DataType::S16);
-
-    src.info()->set_format(format);
-    dst_x.info()->set_format(Format::S16);
-    dst_y.info()->set_format(Format::S16);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create scharr 3x3 configure function
-    NEScharr3x3 scharr;
-    scharr.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value);
-
-    // Validate valid region
-    constexpr BorderSize border_size{ 1 };
-    const ValidRegion    dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, border_size);
-
-    validate(dst_x.info()->valid_region(), dst_valid_region);
-    validate(dst_y.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-
-    calculator.set_border_mode(border_mode);
-    calculator.set_border_size(1);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-1);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst_x.info()->padding(), dst_padding);
-    validate(dst_y.info()->padding(), dst_padding);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScharr3x3Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                        Format::U8)),
-                                                                                                datasets::GradientDimensions()))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEScharr3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                      Format::U8)),
-                                                                                              datasets::GradientDimensions()))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Select.cpp b/tests/validation/NEON/Select.cpp
index 9ac7a6f013..40744581b0 100644
--- a/tests/validation/NEON/Select.cpp
+++ b/tests/validation/NEON/Select.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -18,7 +18,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONCLCTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
@@ -41,8 +41,6 @@ namespace validation
 {
 namespace
 {
-auto configuration_dataset = combine(framework::dataset::concat(datasets::SmallShapes(), datasets::LargeShapes()),
-                                     framework::dataset::make("has_same_rank", { false, true }));
 auto run_small_dataset = combine(datasets::SmallShapes(), framework::dataset::make("has_same_rank", { false, true }));
 auto run_large_dataset = combine(datasets::LargeShapes(), framework::dataset::make("has_same_rank", { false, true }));
 } // namespace
@@ -100,26 +98,6 @@ TEST_SUITE(Float)
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, configuration_dataset,
-               shape, same_rank)
-{
-    const DataType dt = DataType::F16;
-
-    // Create tensors
-    Tensor ref_c = create_tensor<Tensor>(detail::select_condition_shape(shape, same_rank), DataType::U8);
-    Tensor ref_x = create_tensor<Tensor>(shape, dt);
-    Tensor ref_y = create_tensor<Tensor>(shape, dt);
-    Tensor dst   = create_tensor<Tensor>(shape, dt);
-
-    // Create and Configure function
-    NESelect select;
-    select.configure(&ref_c, &ref_x, &ref_y, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NESelectFixture<half>,
                        framework::DatasetMode::PRECOMMIT,
@@ -141,26 +119,6 @@ TEST_SUITE_END() // F16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(FP32)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, configuration_dataset,
-               shape, same_rank)
-{
-    const DataType dt = DataType::F32;
-
-    // Create tensors
-    Tensor ref_c = create_tensor<Tensor>(detail::select_condition_shape(shape, same_rank), DataType::U8);
-    Tensor ref_x = create_tensor<Tensor>(shape, dt);
-    Tensor ref_y = create_tensor<Tensor>(shape, dt);
-    Tensor dst   = create_tensor<Tensor>(shape, dt);
-
-    // Create and Configure function
-    NESelect select;
-    select.configure(&ref_c, &ref_x, &ref_y, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NESelectFixture<float>,
                        framework::DatasetMode::PRECOMMIT,
@@ -182,7 +140,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // Select
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Slice.cpp b/tests/validation/NEON/Slice.cpp
index 69506a5fac..d5549c8cdb 100644
--- a/tests/validation/NEON/Slice.cpp
+++ b/tests/validation/NEON/Slice.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,24 +63,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-DATA_TEST_CASE(Configuration,
-               framework::DatasetMode::ALL,
-               combine(arm_compute::test::datasets::SmallSliceDataset(), framework::dataset::make("DataType", { DataType::QASYMM8, DataType::F32 })),
-               shape, starts, ends, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst;
-
-    // Create and Configure function
-    NESlice slice;
-    slice.configure(&src, &dst, starts, ends);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(dst.info()->tensor_shape());
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 template <typename T>
 using NESliceFixture = SliceFixture<Tensor, Accessor, NESlice, T>;
 
@@ -129,7 +111,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // Slice
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Sobel.cpp b/tests/validation/NEON/Sobel.cpp
deleted file mode 100644
index c9ff7d6b95..0000000000
--- a/tests/validation/NEON/Sobel.cpp
+++ /dev/null
@@ -1,408 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
-#include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
-#include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/SobelFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(Sobel)
-
-TEST_SUITE(W3x3)
-using NESobel3x3Fixture = SobelValidationFixture<Tensor, Accessor, NESobel3x3, uint8_t, int16_t>;
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                   Format::U8)),
-               shape, border_mode, format)
-{
-    // Generate a random constant value
-    std::mt19937                           gen(library->seed());
-    std::uniform_int_distribution<uint8_t> int_dist(0, 255);
-    const uint8_t                          constant_border_value = int_dist(gen);
-
-    // Create tensors
-    Tensor src   = create_tensor<Tensor>(shape, data_type_from_format(format));
-    Tensor dst_x = create_tensor<Tensor>(shape, DataType::S16);
-    Tensor dst_y = create_tensor<Tensor>(shape, DataType::S16);
-
-    src.info()->set_format(format);
-    dst_x.info()->set_format(Format::S16);
-    dst_y.info()->set_format(Format::S16);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create sobel 3x3 configure function
-    NESobel3x3 sobel;
-    sobel.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value);
-
-    // Validate valid region
-    constexpr BorderSize border_size{ 1 };
-    const ValidRegion    dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, border_size);
-
-    validate(dst_x.info()->valid_region(), dst_valid_region);
-    validate(dst_y.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-
-    calculator.set_border_mode(border_mode);
-    calculator.set_border_size(1);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-1);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst_x.info()->padding(), dst_padding);
-    validate(dst_y.info()->padding(), dst_padding);
-}
-
-TEST_SUITE(X)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel3x3Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                       Format::U8)),
-                                                                                               framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-}
-TEST_SUITE_END()
-TEST_SUITE(Y)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel3x3Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                       Format::U8)),
-                                                                                               framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE(XY)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel3x3Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                       Format::U8)),
-                                                                                               framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel3x3Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(1));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-TEST_SUITE(W5x5)
-using NESobel5x5Fixture = SobelValidationFixture<Tensor, Accessor, NESobel5x5, uint8_t, int16_t>;
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                   Format::U8)),
-               shape, border_mode, format)
-{
-    // Generate a random constant value
-    std::mt19937                           gen(library->seed());
-    std::uniform_int_distribution<uint8_t> int_dist(0, 255);
-    const uint8_t                          constant_border_value = int_dist(gen);
-
-    // Create tensors
-    Tensor src   = create_tensor<Tensor>(shape, data_type_from_format(format));
-    Tensor dst_x = create_tensor<Tensor>(shape, DataType::S16);
-    Tensor dst_y = create_tensor<Tensor>(shape, DataType::S16);
-
-    src.info()->set_format(format);
-    dst_x.info()->set_format(Format::S16);
-    dst_y.info()->set_format(Format::S16);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create sobel 5x5 configure function
-    NESobel5x5 sobel;
-    sobel.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value);
-
-    // Validate valid region
-    constexpr BorderSize border_size{ 2 };
-    const ValidRegion    dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, border_size);
-
-    validate(dst_x.info()->valid_region(), dst_valid_region);
-    validate(dst_y.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 16);
-
-    calculator.set_border_mode(border_mode);
-    calculator.set_border_size(2);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_processed_elements(8);
-    calculator.set_access_offset(-2);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst_x.info()->padding(), dst_padding);
-    validate(dst_y.info()->padding(), dst_padding);
-}
-TEST_SUITE(X)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel5x5Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                       Format::U8)),
-                                                                                               framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel5x5Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-}
-TEST_SUITE_END()
-TEST_SUITE(Y)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel5x5Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                       Format::U8)),
-                                                                                               framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel5x5Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE(XY)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel5x5Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                       Format::U8)),
-                                                                                               framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel5x5Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(2));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-TEST_SUITE(W7x7)
-using NESobel7x7Fixture = SobelValidationFixture<Tensor, Accessor, NESobel7x7, uint8_t, int32_t>;
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                   Format::U8)),
-               shape, border_mode, format)
-{
-    // Generate a random constant value
-    std::mt19937                           gen(library->seed());
-    std::uniform_int_distribution<uint8_t> int_dist(0, 255);
-    const uint8_t                          constant_border_value = int_dist(gen);
-
-    // Create tensors
-    Tensor src   = create_tensor<Tensor>(shape, data_type_from_format(format));
-    Tensor dst_x = create_tensor<Tensor>(shape, DataType::S32);
-    Tensor dst_y = create_tensor<Tensor>(shape, DataType::S32);
-
-    src.info()->set_format(format);
-    dst_x.info()->set_format(Format::S32);
-    dst_y.info()->set_format(Format::S32);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create sobel 7x7 configure function
-    NESobel7x7 sobel;
-    sobel.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value);
-
-    // Validate valid region
-    constexpr BorderSize border_size{ 3 };
-    const ValidRegion    dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, border_size);
-
-    validate(dst_x.info()->valid_region(), dst_valid_region);
-    validate(dst_y.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-
-    calculator.set_border_mode(border_mode);
-    calculator.set_border_size(3);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-3);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst_x.info()->padding(), dst_padding);
-    validate(dst_y.info()->padding(), dst_padding);
-}
-TEST_SUITE(X)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel7x7Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                       Format::U8)),
-                                                                                               framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel7x7Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_X)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-}
-TEST_SUITE_END()
-TEST_SUITE(Y)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel7x7Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                       Format::U8)),
-                                                                                               framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel7x7Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_Y)))
-{
-    // Validate output
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE(XY)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESobel7x7Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                       Format::U8)),
-                                                                                               framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NESobel7x7Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::Large2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                                                     Format::U8)),
-                                                                                             framework::dataset::make("GradientDimension", GradientDimension::GRAD_XY)))
-{
-    // Validate output
-    ValidRegion valid_region_x = shape_to_valid_region(_reference.first.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.first), _reference.first, valid_region_x);
-
-    ValidRegion valid_region_y = shape_to_valid_region(_reference.second.shape(), (_border_mode == BorderMode::UNDEFINED), BorderSize(3));
-    validate(Accessor(_target.second), _reference.second, valid_region_y);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp
index 8af3847cf8..94d0866c38 100644
--- a/tests/validation/NEON/SoftmaxLayer.cpp
+++ b/tests/validation/NEON/SoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020, 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,21 +25,22 @@
 #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/cpu/kernels/CpuSoftmaxKernel.h"
 #include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
 #include "tests/datasets/ShapeDatasets.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/SoftmaxLayerFixture.h"
-
 namespace arm_compute
 {
 namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
 namespace
 {
 /** Tolerance for float operations */
@@ -51,7 +52,7 @@ constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
 constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
 
 /** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
+const auto CNNDataTypes = make("DataType",
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     DataType::F16,
@@ -62,47 +63,55 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
 
 TEST_SUITE(NEON)
 TEST_SUITE(SoftmaxLayer)
-
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),    // Mismatching data types
-                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),    // Mismatching shapes
-                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info
-                                                                  QuantizationInfo(1.f/256, 12)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
-                                                                  QuantizationInfo(1.f/256, 12)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,  //Invalid axis value
-                                                                  QuantizationInfo(1.f/256, 12)),
-                                                      }),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
-                                                       TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8,
-                                                                  QuantizationInfo(1.f/256, 12)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
-                                                                  QuantizationInfo(1.f/256, 0)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
-                                                                  QuantizationInfo(1.f/256, 0)),
-                                                     })),
-               framework::dataset::make("beta", { 1.0,
-                                                  2.0,
-                                                  1.0,
-                                                  2.0,
-                                                  1.0,
-                                                  2.0,
-                                                  1.0,
-                                                })),
-               framework::dataset::make("axis", { 1,
-                                                  1,
-                                                  1,
-                                                  -1,
-                                                  1,
-                                                  -3,
-                                                })),
-               framework::dataset::make("Expected", { false, false, false, true, true, false })),
-               input_info, output_info, beta, axis, expected)
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+    make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),    // Mismatching data types
+                        TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),    // Mismatching shapes
+                        TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info
+                                    QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                    QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,  //Invalid axis high
+                                    QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,  //Invalid axis low
+                                    QuantizationInfo(1.f/256, 12)),
+                        }),
+    make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
+                        TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
+                        TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8,
+                                    QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                    QuantizationInfo(1.f/256, 0)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                    QuantizationInfo(1.f/256, 0)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                    QuantizationInfo(1.f/256, 0)),
+                        }),
+    make("beta", { 1.0,
+                   2.0,
+                   1.0,
+                   2.0,
+                   1.0,
+                   1.0,
+                   2.0,
+                   1.0,
+                }),
+    make("axis", { 0,
+                   0,
+                   0,
+                   1,
+                   0,
+                   -1,
+                   2,
+                   -3,
+                }),
+    make("Expected", { false, false, false, true, true, true, false, false })),
+    input_info, output_info, beta, axis, expected)
 {
     ARM_COMPUTE_EXPECT(bool(NESoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS);
 }
@@ -112,29 +121,80 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 template <typename T>
 using NESoftmaxLayerFixture = SoftmaxValidationFixture<Tensor, Accessor, NESoftmaxLayer, T>;
 
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
+    concat(
+        combine(
+            make("CpuExt", std::string("neon")),
+            make("DataType", { DataType::F32,
+                            DataType::F16,
+                            DataType::QASYMM8,
+                            DataType::QASYMM8_SIGNED})
+        ),
+        combine(
+            make("CpuExt", std::string("sme2")),
+            make("DataType", { DataType::F32,
+                            DataType::F16}))
+        ),
+        cpu_ext, data_type)
+{
+    using namespace cpu::kernels;
+
+    cpuinfo::CpuIsaInfo cpu_isa{};
+    cpu_isa.neon = (cpu_ext == "neon");
+    cpu_isa.sme2 = (cpu_ext == "sme2");
+    cpu_isa.fp16 = (data_type == DataType::F16);
+
+    const auto *selected_impl = CpuSoftmaxKernel::get_implementation(
+        SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */, 0 /* axis */, CPUInfo::get().get_sme2_vector_length()},
+        cpu::KernelSelectionType::Preferred);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
+
+    std::string expected = cpu_ext + "_" + cpu_impl_dt(data_type) + "_softmax";
+    std::string actual   = selected_impl->name;
+
+    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
+}
+
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                 framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                 framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                         framework::dataset::make("Axis", { 1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+    combine(
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, -1 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+    combine(
+        datasets::SmallShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, 1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                                                                   framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                           framework::dataset::make("Axis", { 1, 2, 3 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+    combine(
+        datasets::Small4DShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f }),
+        make("Axis", { 0, 2, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                                       framework::dataset::make("DataType", DataType::F16)),
-                                                                                                               framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                       framework::dataset::make("Axis", { 1 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+    combine(
+        datasets::SoftmaxLayerLargeShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -143,26 +203,30 @@ TEST_SUITE_END() //FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                    framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                            framework::dataset::make("Axis", { 1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+    combine(
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::F32),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                    framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                            framework::dataset::make("Axis", { 1, 2, 3 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+    combine(datasets::Small4DShapes(),
+        make("DataType", DataType::F32),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, -2, 3 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                        framework::dataset::make("Axis", { 1 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::SoftmaxLayerLargeShapes(),
+        make("DataType", DataType::F32),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -175,29 +239,40 @@ using NESoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture<Tensor,
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                                 framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                 combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                         framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                 framework::dataset::make("Axis", { 1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+    combine(
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::QASYMM8),
+        combine(
+            make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+            make("Beta", { 1.0f, 2.f })
+        ),
+        make("Axis", { 0, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                 framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                 combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                         framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                 framework::dataset::make("Axis", { -1, 2, 3 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+    combine(
+        datasets::Small4DShapes(),
+        make("DataType", DataType::QASYMM8),
+        combine(
+            make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+            make("Beta", { 1.0f, 2.f })),
+        make("Axis", { 0, 1, -2 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                   combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                           framework::dataset::make("Beta", { 1.0f, 2.0f }))),
-                                                                                                                   framework::dataset::make("Axis", { 1 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+    combine(
+        datasets::SoftmaxLayerLargeShapes(),
+        make("DataType", DataType::QASYMM8),
+        combine(
+            make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+            make("Beta", { 1.0f, 2.0f })
+        ),
+        make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -205,20 +280,28 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framew
 TEST_SUITE_END() //QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                                                                                        combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                framework::dataset::make("Axis", { -1, 1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+    combine(
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        combine(
+            make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+            make("Beta", { 1.0f, 2.f })
+        ),
+        make("Axis", { 0, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                                                                                        combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                framework::dataset::make("Axis", { -2, 2, 3 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+    combine(
+        datasets::Small4DShapes(),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        combine(
+            make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+            make("Beta", { 1.0f, 2.f })
+        ),
+        make("Axis", { 0, 1, -1 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
diff --git a/tests/validation/NEON/SpaceToBatchLayer.cpp b/tests/validation/NEON/SpaceToBatchLayer.cpp
index fc8a800b48..61642c6ff9 100644
--- a/tests/validation/NEON/SpaceToBatchLayer.cpp
+++ b/tests/validation/NEON/SpaceToBatchLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -162,7 +162,7 @@ FIXTURE_DATA_TEST_CASE(Large, NESpaceToBatchLayerQuantizedFixture<uint8_t>, fram
 TEST_SUITE_END() // QASYMM8
 TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // SpaceToBatch
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/SpaceToDepthLayer.cpp b/tests/validation/NEON/SpaceToDepthLayer.cpp
index 8d2726168c..6d87bc4fd4 100644
--- a/tests/validation/NEON/SpaceToDepthLayer.cpp
+++ b/tests/validation/NEON/SpaceToDepthLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -107,7 +107,7 @@ TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // SpaceToDepthLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Split.cpp b/tests/validation/NEON/Split.cpp
index 5ad19a69ac..72df2ad663 100644
--- a/tests/validation/NEON/Split.cpp
+++ b/tests/validation/NEON/Split.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -91,66 +91,6 @@ DATA_TEST_CASE(ValidateSplitShapes, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-DATA_TEST_CASE(Configuration,
-               framework::DatasetMode::ALL,
-               combine(datasets::SmallSplitDataset(), framework::dataset::make("DataType", { DataType::QASYMM8, DataType::F32 })),
-               shape, axis, splits, data_type)
-{
-    // Create tensors
-    Tensor                 src = create_tensor<Tensor>(shape, data_type);
-    std::vector<Tensor>    dsts(splits);
-    std::vector<ITensor *> dsts_ptrs;
-    dsts_ptrs.reserve(splits);
-    for(auto &dst : dsts)
-    {
-        dsts_ptrs.emplace_back(&dst);
-    }
-
-    // Create and Configure function
-    NESplit split;
-    split.configure(&src, dsts_ptrs, axis);
-
-    // Validate valid regions
-    for(auto &dst : dsts)
-    {
-        const ValidRegion valid_region = shape_to_valid_region(dst.info()->tensor_shape());
-        validate(dst.info()->valid_region(), valid_region);
-    }
-}
-
-DATA_TEST_CASE(ConfigurationSplitShapes,
-               framework::DatasetMode::ALL,
-               combine(datasets::SmallSplitShapesDataset(), framework::dataset::make("DataType", { DataType::F16, DataType::F32 })),
-               shape, axis, split_shapes, data_type)
-{
-    // Create tensors
-    Tensor              src = create_tensor<Tensor>(shape, data_type);
-    std::vector<Tensor> dsts;
-
-    for(const auto &split_shape : split_shapes)
-    {
-        Tensor dst = create_tensor<Tensor>(split_shape, data_type);
-        dsts.push_back(std::move(dst));
-    }
-
-    std::vector<ITensor *> dsts_ptrs;
-    for(auto &dst : dsts)
-    {
-        dsts_ptrs.emplace_back(&dst);
-    }
-
-    // Create and Configure function
-    NESplit split;
-    split.configure(&src, dsts_ptrs, axis);
-
-    // Validate valid regions
-    for(auto &dst : dsts)
-    {
-        const ValidRegion valid_region = shape_to_valid_region(dst.info()->tensor_shape());
-        validate(dst.info()->valid_region(), valid_region);
-    }
-}
-
 template <typename T>
 using NESplitFixture = SplitFixture<Tensor, ITensor, Accessor, NESplit, T>;
 
@@ -226,7 +166,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // Split
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/StackLayer.cpp b/tests/validation/NEON/StackLayer.cpp
index fbe4537771..3828010c7b 100644
--- a/tests/validation/NEON/StackLayer.cpp
+++ b/tests/validation/NEON/StackLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,98 +44,74 @@ namespace test
 {
 namespace validation
 {
+
+using framework::dataset::make;
 namespace
 {
 // *INDENT-OFF*
 // clang-format off
 /** Data types */
-const auto data_types = framework::dataset::make("DataType", { DataType::QASYMM8, DataType::F16, DataType::F32 });
+const auto data_types = make("DataType", { DataType::QASYMM8, DataType::F16, DataType::F32 });
 
 /** Num tensors values to test */
-const auto n_values = framework::dataset::make("NumTensors", { 3, 4 });
+const auto n_values = make("NumTensors", { 3, 4 });
 
 /** Shapes 1D to test */
-const auto shapes_1d_small = combine(datasets::Small1DShapes(), framework::dataset::make("Axis", -1, 2));
+const auto shapes_1d_small = combine(datasets::Small1DShapes(), make("Axis", -1, 2));
 
 /** Shapes 2D to test */
-const auto shapes_2d_small = combine(datasets::Small2DShapes(), framework::dataset::make("Axis", -2, 3));
+const auto shapes_2d_small = combine(datasets::Small2DShapes(), make("Axis", -2, 3));
 
 /** Shapes 3D to test */
-const auto shapes_3d_small = combine(datasets::Small3DShapes(), framework::dataset::make("Axis", -3, 4));
+const auto shapes_3d_small = combine(datasets::Small3DShapes(), make("Axis", -3, 4));
 
 /** Shapes 4D to test */
-const auto shapes_4d_small = combine(datasets::Small4DShapes(), framework::dataset::make("Axis", -4, 5));
+const auto shapes_4d_small = combine(datasets::Small4DShapes(), make("Axis", -4, 5));
 
 /** Shapes 1D to test */
-const auto shapes_1d_large = combine(datasets::Large1DShapes(), framework::dataset::make("Axis", -1, 2));
+const auto shapes_1d_large = combine(datasets::Large1DShapes(), make("Axis", -1, 2));
 
 /** Shapes 2D to test */
-const auto shapes_2d_large = combine(datasets::Medium2DShapes(), framework::dataset::make("Axis", -2, 3));
+const auto shapes_2d_large = combine(datasets::Medium2DShapes(), make("Axis", -2, 3));
 
 /** Shapes 3D to test */
-const auto shapes_3d_large = combine(datasets::Medium3DShapes(), framework::dataset::make("Axis", -3, 4));
+const auto shapes_3d_large = combine(datasets::Medium3DShapes(), make("Axis", -3, 4));
 
 /** Shapes 4D to test */
-const auto shapes_4d_large = combine(datasets::Medium4DShapes(), framework::dataset::make("Axis", -4, 5));
-
-/** Configuration test */
-void validate_configuration(TensorShape shape_in, int axis, DataType data_type, int num_tensors)
-{
-    // Wrap around negative values
-    const unsigned int axis_u = wrap_around(axis, static_cast<int>(shape_in.num_dimensions() + 1));
-
-    const TensorShape shape_dst = compute_stack_shape(TensorInfo(shape_in, 1, data_type), axis_u, num_tensors);
-
-    std::vector<Tensor>   tensors(num_tensors);
-    std::vector<ITensor*> src(num_tensors);
-
-    // Create vector of input tensors
-    for(int i = 0; i < num_tensors; ++i)
-    {
-        tensors[i] = create_tensor<Tensor>(shape_in, data_type);
-        src[i]     = &(tensors[i]);
-        ARM_COMPUTE_EXPECT(src[i]->info()->is_resizable(), framework::LogLevel::ERRORS);
-    }
-
-    // Create tensors
-    Tensor dst = create_tensor<Tensor>(shape_dst, data_type);
-
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEStackLayer stack;
-    stack.configure(src, axis, &dst);
-}
+const auto shapes_4d_large = combine(datasets::Medium4DShapes(), make("Axis", -4, 5));
 } // namespace
 
 /** Fixture to use */
 template<typename T>
 using NEStackLayerFixture = StackLayerValidationFixture<Tensor, ITensor, Accessor, NEStackLayer, T>;
 
+template<typename T>
+using NEStackLayerWithPaddingFixture = StackLayerWithPaddingValidationFixture<Tensor, ITensor, Accessor, NEStackLayer, T>;
+
 using namespace arm_compute::misc::shape_calculator;
 
 TEST_SUITE(NEON)
 TEST_SUITE(StackLayer)
 
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-                                                                      framework::dataset::make("InputInfo",
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+make("InputInfo",
 {
     std::vector<TensorInfo>{ TensorInfo(TensorShape(9U, 8U), 1, DataType::U8) },
-    std::vector<TensorInfo>{ TensorInfo(TensorShape(1U, 2U), 1, DataType::U8) , TensorInfo(TensorShape(1U, 2U), 1, DataType::U8), TensorInfo(TensorShape(1U, 2U), 1, DataType::U8)}, 
+    std::vector<TensorInfo>{ TensorInfo(TensorShape(1U, 2U), 1, DataType::U8) , TensorInfo(TensorShape(1U, 2U), 1, DataType::U8), TensorInfo(TensorShape(1U, 2U), 1, DataType::U8)},
     std::vector<TensorInfo>{ TensorInfo(TensorShape(2U, 3U), 1, DataType::S32) },
-    std::vector<TensorInfo>{ TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32), TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32)}, 
+    std::vector<TensorInfo>{ TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32), TensorInfo(TensorShape(7U, 5U, 3U, 8U, 2U), 1, DataType::S32)},
     std::vector<TensorInfo>{ TensorInfo(TensorShape(9U, 8U), 1, DataType::S32) },
 }),
-framework::dataset::make("OutputInfo",
+make("OutputInfo",
 {
     TensorInfo(TensorShape(1U, 9U, 8U), 1, DataType::U8),   // Passes, stack 1 tensor on x axis
     TensorInfo(TensorShape(1U, 3U, 2U), 1, DataType::U8),   // Passes, stack 3 tensors on y axis
     TensorInfo(TensorShape(1U, 2U, 3U), 1, DataType::S32),  // fails axis <  (- input's rank)
     TensorInfo(TensorShape(3U, 7U, 5U), 1, DataType::S32),  // fails, input dimensions > 4
     TensorInfo(TensorShape(1U, 2U, 3U), 1, DataType::U8),   // fails mismatching data types
-})),
-framework::dataset::make("Axis", { -3, 1, -4, -3, 1 })),
-framework::dataset::make("Expected", { true, true, false, false, false })),
+}),
+make("Axis", { -3, 1, -4, -3, 1 }),
+make("Expected", { true, true, false, false, false })),
 input_info, output_info, axis, expected)
 {
     std::vector<TensorInfo>    ti(input_info);
@@ -148,29 +124,20 @@ input_info, output_info, axis, expected)
 }
 
 TEST_SUITE(Shapes1D)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(shapes_1d_small,
-                                                                           data_types),
-                                                                           n_values),
-shape_in, axis, data_type, num_tensors)
-{
-    validate_configuration(shape_in, axis, data_type, num_tensors);
-}
-
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_1d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_1d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_1d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_1d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -179,18 +146,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_1d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_1d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_1d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_1d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -199,18 +166,18 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_1d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_1d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_1d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_1d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -219,29 +186,20 @@ TEST_SUITE_END() // S8
 TEST_SUITE_END() // Shapes1D
 
 TEST_SUITE(Shapes2D)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(shapes_2d_small,
-                                                                           data_types),
-                                                                           n_values),
-shape_in, axis, data_type, num_tensors)
-{
-    validate_configuration(shape_in, axis, data_type, num_tensors);
-}
-
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_2d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_2d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_2d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_2d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -250,18 +208,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_2d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_2d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_2d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_2d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -270,18 +228,18 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_2d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_2d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_2d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_2d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -290,28 +248,20 @@ TEST_SUITE_END() // S8
 TEST_SUITE_END() // Shapes2D
 
 TEST_SUITE(Shapes3D)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(shapes_3d_small,
-                                                                           data_types),
-                                                                           n_values),
-shape_in, axis, data_type, num_tensors)
-{
-    validate_configuration(shape_in, axis, data_type, num_tensors);
-}
-
 TEST_SUITE(S32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_3d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_3d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_3d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_3d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -320,18 +270,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_3d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_3d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_3d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_3d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -340,18 +290,18 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_3d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_3d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_3d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_3d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -360,28 +310,31 @@ TEST_SUITE_END() // S8
 TEST_SUITE_END() // Shapes3D
 
 TEST_SUITE(Shapes4D)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(shapes_4d_small,
-                                                                           data_types),
-                                                                           n_values),
-shape_in, axis, data_type, num_tensors)
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
-    validate_configuration(shape_in, axis, data_type, num_tensors);
+    // Validate output
+    validate(Accessor(_target), _reference);
 }
 
-TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<int>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_4d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+// Padding is tested only for 4D shapes and a single data type, because the underlying code
+// path depends only on the padding, which isn't affected by the shapes or data types.
+FIXTURE_DATA_TEST_CASE(RunSmallWithPadding, NEStackLayerWithPaddingFixture<int>, framework::DatasetMode::ALL,
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<int>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_4d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S32 })),
-                                                                           n_values))
+    combine(shapes_4d_large,
+            make("DataType", { DataType::S32 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -390,18 +343,18 @@ TEST_SUITE_END() // S32
 
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<short>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_4d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<short>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_4d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S16 })),
-                                                                           n_values))
+    combine(shapes_4d_large,
+            make("DataType", { DataType::S16 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -410,26 +363,39 @@ TEST_SUITE_END() // S16
 
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture<char>, framework::DatasetMode::ALL,
-                                                           combine(combine(shapes_4d_small,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_4d_small,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEStackLayerFixture<char>, framework::DatasetMode::NIGHTLY,
-                                                           combine(combine(shapes_4d_large,
-                                                                           framework::dataset::make("DataType", { DataType::S8 })),
-                                                                           n_values))
+    combine(shapes_4d_large,
+            make("DataType", { DataType::S8 }),
+            n_values))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // S8
 TEST_SUITE_END() // Shapes4D
+
+TEST_SUITE(HighDimensional)
+// The Cpu implementation supports tensors with more than 4 dimensions, but the reference implementation does not, so this test is disabled.
+FIXTURE_DATA_TEST_CASE(RunHighDimensional, NEStackLayerFixture<char>, framework::DatasetMode::DISABLED,
+    combine(make("Shape", { TensorShape{2U, 3U, 4U, 5U, 3U} }),
+            make("Axis", { 5, 0, -3, 2 }),
+            make("DataType", { DataType::S8 }),
+            make("NumTensors", { 3 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // HighDimensional
 TEST_SUITE_END() // StackLayer
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/StridedSlice.cpp b/tests/validation/NEON/StridedSlice.cpp
index f94aed0d51..a1b3cef801 100644
--- a/tests/validation/NEON/StridedSlice.cpp
+++ b/tests/validation/NEON/StridedSlice.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -65,24 +65,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
-DATA_TEST_CASE(Configuration,
-               framework::DatasetMode::ALL,
-               combine(arm_compute::test::datasets::SmallStridedSliceDataset(), framework::dataset::make("DataType", { DataType::QASYMM8, DataType::F32 })),
-               shape, starts, ends, strides, begin_mask, end_mask, shrink_mask, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst;
-
-    // Create and Configure function
-    NEStridedSlice strided_slice;
-    strided_slice.configure(&src, &dst, starts, ends, strides, begin_mask, end_mask, shrink_mask);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(dst.info()->tensor_shape());
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 template <typename T>
 using NEStridedSliceFixture = StridedSliceFixture<Tensor, Accessor, NEStridedSlice, T>;
 
@@ -131,7 +113,7 @@ TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // StridedSlice
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/NEON/TableLookup.cpp b/tests/validation/NEON/TableLookup.cpp
deleted file mode 100644
index 0eae2eaa23..0000000000
--- a/tests/validation/NEON/TableLookup.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NETableLookup.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-
-#include "tests/NEON/Accessor.h"
-#include "tests/NEON/LutAccessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-
-#include "tests/validation/Helpers.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/TableLookupFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(TableLookup)
-
-template <typename T>
-using NETableLookupFixture = TableLookupValidationFixture<Tensor, Accessor, NETableLookup, LutAccessor<T>, Lut, T>;
-TEST_SUITE(U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })),
-               shape, data_type)
-{
-    // Create Lut
-    const int num_elem = (data_type == DataType::U8) ? std::numeric_limits<uint8_t>::max() + 1 : std::numeric_limits<int16_t>::max() - std::numeric_limits<int16_t>::lowest() + 1;
-    Lut       lut(num_elem, data_type);
-
-    switch(data_type)
-    {
-        case DataType::U8:
-            fill_lookuptable(LutAccessor<uint8_t>(lut));
-            break;
-        case DataType::S16:
-            fill_lookuptable(LutAccessor<int16_t>(lut));
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-    }
-
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    // Create and Configure function
-    NETableLookup table_lookup;
-    table_lookup.configure(&src, &lut, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-FIXTURE_DATA_TEST_CASE(RunSmallU8, NETableLookupFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLargeU8, NETableLookupFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END()
-
-TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmallS16, NETableLookupFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::S16)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-FIXTURE_DATA_TEST_CASE(RunLargeS16, NETableLookupFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::S16)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END()
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Threshold.cpp b/tests/validation/NEON/Threshold.cpp
deleted file mode 100644
index 7cddf7ccb7..0000000000
--- a/tests/validation/NEON/Threshold.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEThreshold.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/datasets/ThresholdDataset.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/ThresholdFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(Threshold)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), datasets::MixedThresholdDataset()),
-                                                                   framework::dataset::make("DataType", DataType::U8)),
-               shape, threshold, false_value, true_value, type, upper, data_type)
-{
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEThreshold thrsh;
-    thrsh.configure(&src, &dst, threshold, false_value, true_value, type, upper);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
-template <typename T>
-using ThresholdFixture = ThresholdValidationFixture<Tensor, Accessor, NEThreshold, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, ThresholdFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), datasets::MixedThresholdDataset()),
-                                                                                                       framework::dataset::make("DataType",
-                                                                                                               DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, ThresholdFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), datasets::MixedThresholdDataset()),
-                                                                                                     framework::dataset::make("DataType",
-                                                                                                             DataType::U8)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/Tile.cpp b/tests/validation/NEON/Tile.cpp
index d033f5924a..2246c491f2 100644
--- a/tests/validation/NEON/Tile.cpp
+++ b/tests/validation/NEON/Tile.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -134,7 +134,7 @@ TEST_SUITE_END() // QASYMM8
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // Tile
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Transpose.cpp b/tests/validation/NEON/Transpose.cpp
index 1f38fcc677..5c771286e3 100644
--- a/tests/validation/NEON/Transpose.cpp
+++ b/tests/validation/NEON/Transpose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/UNIT/DynamicTensor.cpp b/tests/validation/NEON/UNIT/DynamicTensor.cpp
index 55ade8c9a7..373cc574c8 100644
--- a/tests/validation/NEON/UNIT/DynamicTensor.cpp
+++ b/tests/validation/NEON/UNIT/DynamicTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -117,7 +117,7 @@ FIXTURE_DATA_TEST_CASE(DynamicTensorType2Pipeline, NEDynamicTensorType2PipelineF
 }
 TEST_SUITE_END() // DynamicTensor
 TEST_SUITE_END() // UNIT
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/UNIT/MemoryManager.cpp b/tests/validation/NEON/UNIT/MemoryManager.cpp
index 8defc7a654..2c57b534fe 100644
--- a/tests/validation/NEON/UNIT/MemoryManager.cpp
+++ b/tests/validation/NEON/UNIT/MemoryManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,15 +62,15 @@ TEST_CASE(BlobMemoryManagerSimpleWithinFunctionLevel, framework::DatasetMode::AL
     norm_layer_1.configure(&src, &dst, NormalizationLayerInfo(NormType::CROSS_MAP, 3));
     norm_layer_2.configure(&src, &dst, NormalizationLayerInfo(NormType::IN_MAP_1D, 3));
 
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
     // Allocate tensors
     src.allocator()->allocate();
     dst.allocator()->allocate();
 
-    ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
     // Finalize memory manager
     mm->populate(allocator, 1 /* num_pools */);
diff --git a/tests/validation/NEON/UNIT/RuntimeContext.cpp b/tests/validation/NEON/UNIT/RuntimeContext.cpp
index 0823085883..e126aded28 100644
--- a/tests/validation/NEON/UNIT/RuntimeContext.cpp
+++ b/tests/validation/NEON/UNIT/RuntimeContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,6 +48,24 @@ namespace validation
 {
 TEST_SUITE(NEON)
 TEST_SUITE(UNIT)
+#if defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
+TEST_CASE(CpuCapacity, framework::DatasetMode::ALL) // Compares the scheduler's thread count with the CPU counts reported by its CPUInfo
+{
+    CPUInfo& ci =  arm_compute::Scheduler::get().cpu_info();
+    const uint32_t total_num_cpus = ci.get_cpu_num();
+    const uint32_t nonlittle_num_cpus = ci.get_cpu_num_excluding_little();
+    const bool has_lmb = ci.cpu_has_little_mid_big(); // "lmb" = little/mid/big, as named by the CPUInfo query
+    const uint32_t num_threads = arm_compute::Scheduler::get().num_threads();
+    // The expectations below only run when a little/mid/big layout is reported; otherwise this test checks nothing.
+    if(has_lmb){
+        ARM_COMPUTE_EXPECT(total_num_cpus!=nonlittle_num_cpus , framework::LogLevel::ERRORS); // the "excluding little" count must differ from the total
+        ARM_COMPUTE_EXPECT(num_threads==nonlittle_num_cpus , framework::LogLevel::ERRORS); // scheduler thread count must equal the "excluding little" count
+    }
+}
+#endif /* defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
+
 TEST_SUITE(RuntimeContext)
 
 TEST_CASE(Scheduler, framework::DatasetMode::ALL)
@@ -57,14 +75,14 @@ TEST_CASE(Scheduler, framework::DatasetMode::ALL)
     RuntimeContext ctx;
 
     // Check if it's been initialised properly
-    ARM_COMPUTE_EXPECT(ctx.scheduler() != nullptr, framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(ctx.asset_manager() == nullptr, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(ctx.scheduler() != nullptr);
+    ARM_COMPUTE_ASSERT(ctx.asset_manager() == nullptr);
 
     // Create a Scheduler
     auto scheduler = SchedulerFactory::create();
     ctx.set_scheduler(scheduler.get());
     // Check if the scheduler has been properly setup
-    ARM_COMPUTE_EXPECT(ctx.scheduler() != nullptr, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(ctx.scheduler() != nullptr);
 
     // Create a new activation function
     NEActivationLayer act_layer(&ctx);
@@ -74,14 +92,14 @@ TEST_CASE(Scheduler, framework::DatasetMode::ALL)
 
     act_layer.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR));
 
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
     // Allocate tensors
     src.allocator()->allocate();
     dst.allocator()->allocate();
 
-    ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
 
     float min_bound = 0;
     float max_bound = 0;
@@ -117,10 +135,10 @@ TEST_CASE(MultipleThreadedScheduller, framework::DatasetMode::ALL)
     act_layer_thread0.configure(&src_t0, &dst_t0, activation_info);
     act_layer_thread1.configure(&src_t1, &dst_t1, activation_info);
 
-    ARM_COMPUTE_EXPECT(src_t0.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_t0.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(src_t1.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_t1.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(src_t0.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(dst_t0.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(src_t1.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(dst_t1.info()->is_resizable());
 
     // Allocate tensors
     src_t0.allocator()->allocate();
@@ -128,8 +146,8 @@ TEST_CASE(MultipleThreadedScheduller, framework::DatasetMode::ALL)
     src_t1.allocator()->allocate();
     dst_t1.allocator()->allocate();
 
-    ARM_COMPUTE_EXPECT(!src_t0.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!src_t1.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!src_t0.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(!src_t1.info()->is_resizable());
 
     float min_bound = 0;
     float max_bound = 0;
@@ -159,7 +177,7 @@ TEST_CASE(MultipleThreadedScheduller, framework::DatasetMode::ALL)
 
 TEST_SUITE_END() // RuntimeContext
 TEST_SUITE_END() // UNIT
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/UNIT/TensorAllocator.cpp b/tests/validation/NEON/UNIT/TensorAllocator.cpp
index 21e4e71fad..0aab9ef9b5 100644
--- a/tests/validation/NEON/UNIT/TensorAllocator.cpp
+++ b/tests/validation/NEON/UNIT/TensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,8 +29,6 @@
 #include "arm_compute/runtime/MemoryRegion.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 
-#include "support/MemorySupport.h"
-
 #include "tests/Globals.h"
 #include "tests/Utils.h"
 #include "tests/framework/Asserts.h"
@@ -58,37 +56,37 @@ TEST_CASE(ImportMemory, framework::DatasetMode::ALL)
 
     // Allocate memory buffer
     const size_t total_size = info.total_size();
-    auto         data       = support::cpp14::make_unique<uint8_t[]>(total_size);
+    auto         data       = std::make_unique<uint8_t[]>(total_size);
 
     // Negative case : Import nullptr
     Tensor t1;
     t1.allocator()->init(info);
-    ARM_COMPUTE_EXPECT(!bool(t1.allocator()->import_memory(nullptr)), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t1.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!bool(t1.allocator()->import_memory(nullptr)));
+    ARM_COMPUTE_ASSERT(t1.info()->is_resizable());
 
     // Negative case : Import misaligned pointer
     Tensor       t2;
     const size_t required_alignment = 339;
     t2.allocator()->init(info, required_alignment);
-    ARM_COMPUTE_EXPECT(!bool(t2.allocator()->import_memory(data.get())), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t2.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!bool(t2.allocator()->import_memory(data.get())));
+    ARM_COMPUTE_ASSERT(t2.info()->is_resizable());
 
     // Negative case : Import memory to a tensor that is memory managed
     Tensor      t3;
     MemoryGroup mg;
     t3.allocator()->set_associated_memory_group(&mg);
-    ARM_COMPUTE_EXPECT(!bool(t3.allocator()->import_memory(data.get())), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t3.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(!bool(t3.allocator()->import_memory(data.get())));
+    ARM_COMPUTE_ASSERT(t3.info()->is_resizable());
 
     // Positive case : Set raw pointer
     Tensor t4;
     t4.allocator()->init(info);
-    ARM_COMPUTE_EXPECT(bool(t4.allocator()->import_memory(data.get())), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!t4.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t4.buffer() == reinterpret_cast<uint8_t *>(data.get()), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(bool(t4.allocator()->import_memory(data.get())));
+    ARM_COMPUTE_ASSERT(!t4.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(t4.buffer() == reinterpret_cast<uint8_t *>(data.get()));
     t4.allocator()->free();
-    ARM_COMPUTE_EXPECT(t4.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t4.buffer() == nullptr, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(t4.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(t4.buffer() == nullptr);
 }
 
 TEST_CASE(ImportMemoryMalloc, framework::DatasetMode::ALL)
@@ -111,13 +109,13 @@ TEST_CASE(ImportMemoryMalloc, framework::DatasetMode::ALL)
     const size_t total_size_in_elems = tensor.info()->tensor_shape().total_size();
     const size_t total_size_in_bytes = tensor.info()->total_size();
     size_t       space               = total_size_in_bytes + required_alignment;
-    auto         raw_data            = support::cpp14::make_unique<uint8_t[]>(space);
+    auto         raw_data            = std::make_unique<uint8_t[]>(space);
 
     void *aligned_ptr = raw_data.get();
-    support::cpp11::align(required_alignment, total_size_in_bytes, aligned_ptr, space);
+    std::align(required_alignment, total_size_in_bytes, aligned_ptr, space);
 
-    ARM_COMPUTE_EXPECT(bool(tensor.allocator()->import_memory(aligned_ptr)), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(bool(tensor.allocator()->import_memory(aligned_ptr)));
+    ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
 
     // Fill tensor
     std::uniform_real_distribution<float> distribution(-5.f, 5.f);
@@ -139,7 +137,7 @@ TEST_CASE(ImportMemoryMalloc, framework::DatasetMode::ALL)
 
     // Release resources
     tensor.allocator()->free();
-    ARM_COMPUTE_EXPECT(tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
 }
 
 TEST_CASE(ImportMemoryMallocPadded, framework::DatasetMode::ALL)
@@ -160,10 +158,10 @@ TEST_CASE(ImportMemoryMallocPadded, framework::DatasetMode::ALL)
 
     // Allocate and import tensor
     const size_t total_size_in_bytes = tensor.info()->total_size();
-    auto         raw_data            = support::cpp14::make_unique<uint8_t[]>(total_size_in_bytes);
+    auto         raw_data            = std::make_unique<uint8_t[]>(total_size_in_bytes);
 
-    ARM_COMPUTE_EXPECT(bool(tensor.allocator()->import_memory(raw_data.get())), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(bool(tensor.allocator()->import_memory(raw_data.get())));
+    ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
 
     // Fill tensor while accounting padding
     std::uniform_real_distribution<float> distribution(-5.f, 5.f);
@@ -192,10 +190,10 @@ TEST_CASE(ImportMemoryMallocPadded, framework::DatasetMode::ALL)
 
     // Release resources
     tensor.allocator()->free();
-    ARM_COMPUTE_EXPECT(tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
 }
 
-#if !defined(BARE_METAL)
+#if !defined(_WIN64) && !defined(BARE_METAL)
 TEST_CASE(ImportMemoryMappedFile, framework::DatasetMode::ALL)
 {
     const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU);
@@ -223,12 +221,12 @@ TEST_CASE(ImportMemoryMappedFile, framework::DatasetMode::ALL)
 
     // Map file
     utils::mmap_io::MMappedFile mmapped_file("test_mmap_import.bin", 0 /** Whole file */, 0);
-    ARM_COMPUTE_EXPECT(mmapped_file.is_mapped(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(mmapped_file.is_mapped());
     unsigned char *data = mmapped_file.data();
 
     // Import memory mapped memory
-    ARM_COMPUTE_EXPECT(bool(tensor.allocator()->import_memory(data)), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(!tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(bool(tensor.allocator()->import_memory(data)));
+    ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());
 
     // Fill tensor
     std::uniform_real_distribution<float> distribution(-5.f, 5.f);
@@ -250,9 +248,9 @@ TEST_CASE(ImportMemoryMappedFile, framework::DatasetMode::ALL)
 
     // Release resources
     tensor.allocator()->free();
-    ARM_COMPUTE_EXPECT(tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
 }
-#endif // !defined(BARE_METAL)
+#endif // !defined(_WIN64) && !defined(BARE_METAL)
 
 TEST_CASE(AlignedAlloc, framework::DatasetMode::ALL)
 {
@@ -264,7 +262,7 @@ TEST_CASE(AlignedAlloc, framework::DatasetMode::ALL)
     t.allocator()->init(info, requested_alignment);
     t.allocator()->allocate();
 
-    ARM_COMPUTE_EXPECT(t.buffer() != nullptr, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_ASSERT(t.buffer() != nullptr);
     ARM_COMPUTE_EXPECT(t.allocator()->alignment() == requested_alignment, framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(arm_compute::utility::check_aligned(reinterpret_cast<void *>(t.buffer()), requested_alignment),
                        framework::LogLevel::ERRORS);
diff --git a/tests/validation/NEON/Unstack.cpp b/tests/validation/NEON/Unstack.cpp
index 6be80df3c6..3e8f1ff324 100644
--- a/tests/validation/NEON/Unstack.cpp
+++ b/tests/validation/NEON/Unstack.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -122,7 +122,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEUnstackFixture<uint8_t>, framework::DatasetMo
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE_END() // Unstack
-TEST_SUITE_END() // NEON
+TEST_SUITE_END() // Neon
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/Upsample.cpp b/tests/validation/NEON/Upsample.cpp
deleted file mode 100644
index 50aadeff05..0000000000
--- a/tests/validation/NEON/Upsample.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEUpsampleLayer.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/UpsampleLayerFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(NEON)
-TEST_SUITE(UpsampleLayer)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, (combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32))),
-               input_shape, data_type)
-{
-    InterpolationPolicy policy = InterpolationPolicy::NEAREST_NEIGHBOR;
-    Size2D              info   = Size2D(2, 2);
-
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(input_shape, data_type, 1);
-    Tensor dst;
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEUpsampleLayer upsample;
-    upsample.configure(&src, &dst, info, policy);
-
-    // Validate valid region
-    const ValidRegion src_valid_region = shape_to_valid_region(src.info()->tensor_shape());
-    const ValidRegion dst_valid_region = shape_to_valid_region(dst.info()->tensor_shape());
-
-    validate(src.info()->valid_region(), src_valid_region);
-    validate(dst.info()->valid_region(), dst_valid_region);
-}
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(10U, 10U, 2U), 1, DataType::F32), // Mismatching data type
-                                            TensorInfo(TensorShape(10U, 10U, 2U), 1, DataType::F32), // Invalid output shape
-                                            TensorInfo(TensorShape(10U, 10U, 2U), 1, DataType::F32), // Invalid stride
-                                            TensorInfo(TensorShape(10U, 10U, 2U), 1, DataType::F32), // Invalid policy
-                                            TensorInfo(TensorShape(32U, 32U), 1, DataType::F32),
-                                          }),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(20U, 20U, 2U), 1, DataType::F16),
-                                            TensorInfo(TensorShape(20U, 10U, 2U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(20U, 20U, 2U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(20U, 20U, 2U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(64U, 64U), 1, DataType::F32),
-                                          })),
-    framework::dataset::make("PadInfo", { Size2D(2, 2),
-                                          Size2D(2, 2),
-                                          Size2D(1, 1),
-                                          Size2D(2, 2),
-                                          Size2D(2, 2),
-                                           })),
-   framework::dataset::make("UpsamplingPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR,
-                                                  InterpolationPolicy::NEAREST_NEIGHBOR,
-                                                  InterpolationPolicy::NEAREST_NEIGHBOR,
-                                                  InterpolationPolicy::BILINEAR,
-                                                  InterpolationPolicy::NEAREST_NEIGHBOR,
-                                                })),
-    framework::dataset::make("Expected", { false, false, false, false, true })),
-    input_info, output_info, pad_info, policy, expected)
-{
-    bool is_valid = bool(NEUpsampleLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pad_info, policy));
-    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
-
-template <typename T>
-using NEUpsampleLayerFixture = UpsampleLayerFixture<Tensor, Accessor, NEUpsampleLayer, T>;
-
-template <typename T>
-using NEUpsampleLayerQuantizedFixture = UpsampleLayerQuantizedFixture<Tensor, Accessor, NEUpsampleLayer, T>;
-
-TEST_SUITE(Float)
-TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEUpsampleLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                   framework::dataset::make("PadInfo", { Size2D(2, 2) })),
-                                                                                                           framework::dataset::make("UpsamplingPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // FP32
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEUpsampleLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                  framework::dataset::make("DataType",
-                                                                                                                          DataType::F16)),
-                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                  framework::dataset::make("PadInfo", { Size2D(2, 2) })),
-                                                                                                          framework::dataset::make("UpsamplingPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR })))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // FP16
-#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-TEST_SUITE_END() // Float
-
-TEST_SUITE(Quantized)
-TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEUpsampleLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                      framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                      framework::dataset::make("PadInfo", { Size2D(2, 2) })),
-                                                                                                                      framework::dataset::make("UpsamplingPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR })),
-                                                                                                                      framework::dataset::make("QuantizationInfo", QuantizationInfo(2.f / 255.f, 10))))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // QASYMM8
-TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEUpsampleLayerQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                     framework::dataset::make("PadInfo", { Size2D(2, 2) })),
-                                                                                                                     framework::dataset::make("UpsamplingPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR })),
-                                                                                                                     framework::dataset::make("QuantizationInfo", QuantizationInfo(2.f / 255.f, 10))))
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // QASYMM8_SIGNED
-TEST_SUITE_END() // Quantized
-
-TEST_SUITE_END() // UpsampleLayer
-TEST_SUITE_END() // NEON
-
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/WarpAffine.cpp b/tests/validation/NEON/WarpAffine.cpp
deleted file mode 100644
index 4339d02141..0000000000
--- a/tests/validation/NEON/WarpAffine.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEWarpAffine.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/InterpolationPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/WarpAffineFixture.h"
-#include "tests/validation/reference/Utils.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/** Tolerance */
-constexpr AbsoluteTolerance<uint8_t> tolerance(1);
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(WarpAffine)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
-                                                                           framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                   datasets::BorderModes()),
-               shape, data_type, policy, border_mode)
-{
-    // Generate a random constant value if border_mode is constant
-    std::mt19937                           gen(library->seed());
-    std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
-    uint8_t                                constant_border_value = distribution_u8(gen);
-
-    // Create the matrix
-    std::array<float, 9> matrix{ {} };
-    fill_warp_matrix<9>(matrix);
-
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEWarpAffine warp_affine;
-    warp_affine.configure(&src, &dst, matrix, policy, border_mode, constant_border_value);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 1);
-    calculator.set_border_mode(border_mode);
-    calculator.set_border_size(1);
-
-    const PaddingSize read_padding(1);
-    const PaddingSize write_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), read_padding);
-    validate(dst.info()->padding(), write_padding);
-}
-
-template <typename T>
-using NEWarpAffineFixture = WarpAffineValidationFixture<Tensor, Accessor, NEWarpAffine, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEWarpAffineFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
-                                                                                                                  framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                          datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, _valid_mask, tolerance, 0.02f);
-}
-DISABLED_FIXTURE_DATA_TEST_CASE(RunLarge, NEWarpAffineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::U8)),
-                                                                                                                 framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                 datasets::BorderModes()))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, _valid_mask, tolerance, 0.02f);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/WarpPerspective.cpp b/tests/validation/NEON/WarpPerspective.cpp
deleted file mode 100644
index 2f21913027..0000000000
--- a/tests/validation/NEON/WarpPerspective.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/WarpPerspectiveFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-constexpr AbsoluteTolerance<uint8_t> tolerance_value(1);
-constexpr float                      tolerance_number = 0.2f;
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(WarpPerspective)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
-                                                                           framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                   datasets::BorderModes()),
-               shape, data_type, policy, border_mode)
-{
-    uint8_t constant_border_value = 0;
-
-    // Generate a random constant value if border_mode is constant
-    if(border_mode == BorderMode::CONSTANT)
-    {
-        std::mt19937                           gen(library->seed());
-        std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
-        constant_border_value = distribution_u8(gen);
-    }
-
-    // Create the matrix
-    std::array<float, 9> matrix = { { 0 } };
-    fill_warp_matrix<9>(matrix);
-
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(shape, data_type);
-    Tensor dst = create_tensor<Tensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEWarpPerspective warp_perspective;
-    warp_perspective.configure(&src, &dst, matrix, policy, border_mode, constant_border_value);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 1);
-    calculator.set_border_mode(border_mode);
-    calculator.set_border_size(1);
-
-    const PaddingSize read_padding(1);
-    const PaddingSize write_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), read_padding);
-    validate(dst.info()->padding(), write_padding);
-}
-
-template <typename T>
-using NEWarpPerspectiveFixture = WarpPerspectiveValidationFixture<Tensor, Accessor, NEWarpPerspective, T>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, NEWarpPerspectiveFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                       DataType::U8)),
-                                                                                                                       framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                               datasets::BorderModes()))
-{
-    validate(Accessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEWarpPerspectiveFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                     DataType::U8)),
-                                                                                                                     framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                             datasets::BorderModes()))
-{
-    validate(Accessor(_target), _reference, _valid_mask, tolerance_value, tolerance_number);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/NEON/WidthConcatenateLayer.cpp b/tests/validation/NEON/WidthConcatenateLayer.cpp
index 3edf136cab..123a77276a 100644
--- a/tests/validation/NEON/WidthConcatenateLayer.cpp
+++ b/tests/validation/NEON/WidthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -69,7 +69,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
     inputs_vector_info.emplace_back(std::move(input_info1));
     inputs_vector_info.emplace_back(std::move(input_info2));
 
-    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    std::vector<const ITensorInfo *> inputs_vector_info_raw;
     inputs_vector_info_raw.reserve(inputs_vector_info.size());
     for(auto &input : inputs_vector_info)
     {
diff --git a/tests/validation/NEON/YOLOLayer.cpp b/tests/validation/NEON/YOLOLayer.cpp
deleted file mode 100644
index 0edbc4e11f..0000000000
--- a/tests/validation/NEON/YOLOLayer.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEYOLOLayer.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ActivationFunctionsDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/YOLOLayerFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-/** Tolerance */
-constexpr AbsoluteTolerance<float> tolerance_f32(1e-6f);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-constexpr RelativeTolerance<float> tolerance_f16(0.01f);
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-/** Floating point data sets. */
-const auto YOLODataset = combine(combine(combine(combine(framework::dataset::make("InPlace", { false, true }), framework::dataset::make("ActivationFunction",
-                                                         ActivationLayerInfo::ActivationFunction::LOGISTIC)),
-                                                 framework::dataset::make("AlphaBeta", { 0.5f, 1.f })),
-                                         framework::dataset::make("Classes", 40)),
-                                 framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }));
-} // namespace
-
-TEST_SUITE(NEON)
-TEST_SUITE(YOLOLayer)
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::U8),  // Wrong input data type
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),  // Invalid activation info
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),  // Wrong output data type
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),  // wrong number of classes
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),  // Mismatching shapes
-                                                       TensorInfo(TensorShape(17U, 16U, 6U), 1, DataType::F32),  // shrink window
-                                                       TensorInfo(TensorShape(17U, 16U, 7U), 1, DataType::F32),  // channels not multiple of (num_classes + 5)
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),  // Valid
-                                                     }),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::U16),
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 11U, 6U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 16U, 7U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32),
-                                                     })),
-               framework::dataset::make("ActivationInfo", { ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
-                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
-                                                     })),
-               framework::dataset::make("Numclasses", { 1, 1, 1, 0, 1, 1, 1, 1
-                                                     })),
-               framework::dataset::make("Expected", { false, false, false, false, false, false, false, true})),
-               input_info, output_info, act_info, num_classes, expected)
-{
-    ARM_COMPUTE_EXPECT(bool(NEYOLOLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), act_info, num_classes)) == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
-
-template <typename T>
-using NEYOLOLayerFixture = YOLOValidationFixture<Tensor, Accessor, NEYOLOLayer, T>;
-
-TEST_SUITE(Float)
-TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEYOLOLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallYOLOShapes(), YOLODataset), framework::dataset::make("DataType",
-                                                                                                       DataType::F32)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_f32);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEYOLOLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeYOLOShapes(), YOLODataset), framework::dataset::make("DataType",
-                                                                                                     DataType::F32)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_f32);
-}
-TEST_SUITE_END() // FP32
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEYOLOLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallYOLOShapes(), YOLODataset), framework::dataset::make("DataType",
-                                                                                                      DataType::F16)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_f16);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEYOLOLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeYOLOShapes(), YOLODataset), framework::dataset::make("DataType",
-                                                                                                    DataType::F16)))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_f16);
-}
-TEST_SUITE_END() // FP16
-#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-TEST_SUITE_END() // Float
-
-TEST_SUITE_END() // YOLOLayer
-TEST_SUITE_END() // NEON
-} // namespace validation
-} // namespace test
-} // namespace arm_compute