Diffstat (limited to 'tests')
-rw-r--r--  tests/datasets/ScatterDataset.h                  41
-rw-r--r--  tests/validation/CL/ScatterLayer.cpp            138
-rw-r--r--  tests/validation/NEON/GEMMLowp.cpp               12
-rw-r--r--  tests/validation/NEON/ReorderLayer.cpp           46
-rw-r--r--  tests/validation/NEON/SoftmaxLayer.cpp            2
-rw-r--r--  tests/validation/NEON/UNIT/RuntimeContext.cpp    15
-rw-r--r--  tests/validation/fixtures/GEMMLowpFixture.h      10
-rw-r--r--  tests/validation/fixtures/ReorderFixture.h       27
-rw-r--r--  tests/validation/fixtures/ScatterLayerFixture.h  59
-rw-r--r--  tests/validation/reference/ScatterLayer.cpp       9
10 files changed, 306 insertions(+), 53 deletions(-)
diff --git a/tests/datasets/ScatterDataset.h b/tests/datasets/ScatterDataset.h
index c0858941db..8fd4448d2d 100644
--- a/tests/datasets/ScatterDataset.h
+++ b/tests/datasets/ScatterDataset.h
@@ -179,10 +179,47 @@ public:
// NOTE: Config is src, updates, indices, output.
// NOTE: Updates/Indices tensors are now batched.
// NOTE: indices.shape.x = (updates_batched) ? (src.num_dimensions - updates.num_dimensions) + 2 : (src.num_dimensions - updates.num_dimensions) + 1
+ // k is the number of batch dimensions
+ // k = 2
add_config(TensorShape(6U, 5U), TensorShape(6U, 2U, 2U), TensorShape(1U, 2U, 2U), TensorShape(6U, 5U));
+ add_config(TensorShape(5U, 5U, 4U, 2U, 2U), TensorShape(5U, 5U, 6U, 2U), TensorShape(3U, 6U, 2U), TensorShape(5U, 5U, 4U, 2U, 2U));
+
+ // k = 3
+ add_config(TensorShape(6U, 5U), TensorShape(6U, 2U, 2U, 2U), TensorShape(1U, 2U, 2U, 2U), TensorShape(6U, 5U));
+ add_config(TensorShape(5U, 5U, 4U, 2U, 2U), TensorShape(5U, 5U, 3U, 6U, 2U), TensorShape(3U, 3U, 6U, 2U), TensorShape(5U, 5U, 4U, 2U, 2U));
+
+ // k = 4
+ add_config(TensorShape(5U, 5U, 4U, 2U, 2U), TensorShape(5U, 6U, 2U, 3U, 2U), TensorShape(4U, 6U, 2U, 3U, 2U), TensorShape(5U, 5U, 4U, 2U, 2U));
+
+ // k = 5
+ add_config(TensorShape(5U, 5U, 4U, 2U, 2U), TensorShape(5U, 3U, 4U, 3U, 2U, 2U), TensorShape(4U, 3U, 4U, 3U, 2U, 2U), TensorShape(5U, 5U, 4U, 2U, 2U));
+ }
+};
+
+class SmallScatterScalarDataset final : public ScatterDataset
+{
+public:
+ // batched scalar case
+ SmallScatterScalarDataset()
+ {
+ add_config(TensorShape(6U, 5U), TensorShape(6U), TensorShape(2U, 6U), TensorShape(6U, 5U));
+ add_config(TensorShape(6U, 5U), TensorShape(6U, 6U), TensorShape(2U, 6U, 6U), TensorShape(6U, 5U));
+ add_config(TensorShape(3U, 3U, 6U, 5U), TensorShape(6U, 6U), TensorShape(4U, 6U, 6U), TensorShape(3U, 3U, 6U, 5U));
+ }
+};
+
+// This dataset is for data types that do not require full testing. It contains selected tests from the datasets above.
+class SmallScatterMixedDataset final : public ScatterDataset
+{
+public:
+ SmallScatterMixedDataset()
+ {
+ add_config(TensorShape(10U), TensorShape(2U), TensorShape(1U, 2U), TensorShape(10U));
+ add_config(TensorShape(9U, 3U, 4U), TensorShape(9U, 3U, 2U), TensorShape(1U, 2U), TensorShape(9U, 3U, 4U));
+ add_config(TensorShape(6U, 5U), TensorShape(6U, 6U), TensorShape(2U, 6U, 6U), TensorShape(6U, 5U));
+ add_config(TensorShape(35U, 4U, 3U, 2U, 2U), TensorShape(35U, 4U), TensorShape(4U, 4U), TensorShape(35U, 4U, 3U, 2U, 2U));
+ add_config(TensorShape(11U, 3U, 3U, 2U, 4U), TensorShape(11U, 3U, 3U, 4U), TensorShape(2U, 4U), TensorShape(11U, 3U, 3U, 2U, 4U));
add_config(TensorShape(6U, 5U, 2U), TensorShape(6U, 2U, 2U), TensorShape(2U, 2U, 2U), TensorShape(6U, 5U, 2U));
- add_config(TensorShape(6U, 5U, 2U, 2U), TensorShape(3U, 2U), TensorShape(4U, 3U, 2U), TensorShape(6U, 5U, 2U, 2U));
- add_config(TensorShape(5U, 5U, 4U, 2U, 2U), TensorShape(6U, 2U), TensorShape(5U, 6U, 2U), TensorShape(5U, 5U, 4U, 2U, 2U));
}
};
} // namespace datasets
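Note on the new configs: with k batch dimensions, every config added above satisfies indices.shape.x = src.num_dimensions - updates.num_dimensions + k; the NOTE in the hunk spells out the k = 1 and k = 2 cases. A small standalone check of that rule (plain C++, independent of the ACL test framework; shapes written innermost-first, as in TensorShape):

    #include <cassert>
    #include <vector>

    // indices.shape.x = src.num_dimensions - updates.num_dimensions + k,
    // where k is the number of batch dimensions (see the NOTEs above).
    bool indices_x_matches(const std::vector<int> &src, const std::vector<int> &updates,
                           const std::vector<int> &indices, int k)
    {
        const int expected = static_cast<int>(src.size()) - static_cast<int>(updates.size()) + k;
        return indices.front() == expected;
    }

    int main()
    {
        // k = 2 config: src (5,5,4,2,2), updates (5,5,6,2), indices (3,6,2).
        assert((indices_x_matches({5, 5, 4, 2, 2}, {5, 5, 6, 2}, {3, 6, 2}, 2)));
        // k = 3 config: src (6,5), updates (6,2,2,2), indices (1,2,2,2).
        assert((indices_x_matches({6, 5}, {6, 2, 2, 2}, {1, 2, 2, 2}, 3)));
        // k = 5 config: src (5,5,4,2,2), updates (5,3,4,3,2,2), indices (4,3,4,3,2,2).
        assert((indices_x_matches({5, 5, 4, 2, 2}, {5, 3, 4, 3, 2, 2}, {4, 3, 4, 3, 2, 2}, 5)));
    }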
diff --git a/tests/validation/CL/ScatterLayer.cpp b/tests/validation/CL/ScatterLayer.cpp
index 4a2462c7d2..b1531eb64a 100644
--- a/tests/validation/CL/ScatterLayer.cpp
+++ b/tests/validation/CL/ScatterLayer.cpp
@@ -41,6 +41,8 @@ namespace validation
namespace
{
RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for fp32 data type */
+RelativeTolerance<float> tolerance_f16(0.02f); /**< Tolerance value for comparing reference's output against implementation's output for fp16 data type */
+RelativeTolerance<int32_t> tolerance_int(0); /**< Tolerance value for comparing reference's output against implementation's output for integer data types */
} // namespace
template <typename T>
@@ -53,6 +55,7 @@ TEST_SUITE(Scatter)
DATA_TEST_CASE(Validate, framework::DatasetMode::PRECOMMIT, zip(
make("InputInfo", { TensorInfo(TensorShape(9U), 1, DataType::F32), // Mismatching data types
TensorInfo(TensorShape(15U), 1, DataType::F32), // Valid
+ TensorInfo(TensorShape(15U), 1, DataType::U8), // Valid
TensorInfo(TensorShape(8U), 1, DataType::F32),
TensorInfo(TensorShape(217U), 1, DataType::F32), // Mismatch input/output dims.
TensorInfo(TensorShape(217U), 1, DataType::F32), // Updates dim higher than Input/Output dims.
@@ -63,6 +66,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::PRECOMMIT, zip(
}),
make("UpdatesInfo",{TensorInfo(TensorShape(3U), 1, DataType::F16),
TensorInfo(TensorShape(15U), 1, DataType::F32),
+ TensorInfo(TensorShape(15U), 1, DataType::U8),
TensorInfo(TensorShape(2U), 1, DataType::F32),
TensorInfo(TensorShape(217U), 1, DataType::F32),
TensorInfo(TensorShape(217U, 3U), 1, DataType::F32),
@@ -73,6 +77,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::PRECOMMIT, zip(
}),
make("IndicesInfo",{TensorInfo(TensorShape(1U, 3U), 1, DataType::S32),
TensorInfo(TensorShape(1U, 15U), 1, DataType::S32),
+ TensorInfo(TensorShape(1U, 15U), 1, DataType::S32),
TensorInfo(TensorShape(1U, 2U), 1, DataType::S32),
TensorInfo(TensorShape(1U, 271U), 1, DataType::S32),
TensorInfo(TensorShape(1U, 271U), 1, DataType::S32),
@@ -83,6 +88,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::PRECOMMIT, zip(
}),
make("OutputInfo",{TensorInfo(TensorShape(9U), 1, DataType::F16),
TensorInfo(TensorShape(15U), 1, DataType::F32),
+ TensorInfo(TensorShape(15U), 1, DataType::U8),
TensorInfo(TensorShape(8U), 1, DataType::F32),
TensorInfo(TensorShape(271U, 3U), 1, DataType::F32),
TensorInfo(TensorShape(271U), 1, DataType::F32),
@@ -93,6 +99,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::PRECOMMIT, zip(
}),
make("ScatterInfo",{ ScatterInfo(ScatterFunction::Add, false),
ScatterInfo(ScatterFunction::Max, false),
+ ScatterInfo(ScatterFunction::Max, false),
ScatterInfo(ScatterFunction::Min, false),
ScatterInfo(ScatterFunction::Add, false),
ScatterInfo(ScatterFunction::Update, false),
@@ -101,7 +108,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::PRECOMMIT, zip(
ScatterInfo(ScatterFunction::Update, false),
ScatterInfo(ScatterFunction::Update, false),
}),
- make("Expected", { false, true, true, false, false, false, false, false, false })),
+ make("Expected", { false, true, true, true, false, false, false, false, false, false })),
input_info, updates_info, indices_info, output_info, scatter_info, expected)
{
const Status status = CLScatter::validate(&input_info, &updates_info, &indices_info, &output_info, scatter_info);
@@ -118,7 +125,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLScatterLayerFixture<float>, framework::Datase
make("DataType", {DataType::F32}),
allScatterFunctions,
make("ZeroInit", {false}),
- make("Inplace", {false})))
+ make("Inplace", {false}),
+ make("Padding", {true})))
{
validate(CLAccessor(_target), _reference, tolerance_f32);
}
@@ -129,7 +137,8 @@ FIXTURE_DATA_TEST_CASE(RunSmallZeroInit, CLScatterLayerFixture<float>, framework
make("DataType", {DataType::F32}),
make("ScatterFunction", {ScatterFunction::Add}),
make("ZeroInit", {true}),
- make("Inplace", {false})))
+ make("Inplace", {false}),
+ make("Padding", {true})))
{
validate(CLAccessor(_target), _reference, tolerance_f32);
}
@@ -140,7 +149,8 @@ FIXTURE_DATA_TEST_CASE(RunSmallMultiDim, CLScatterLayerFixture<float>, framework
make("DataType", {DataType::F32}),
allScatterFunctions,
make("ZeroInit", {false}),
- make("Inplace", {false})))
+ make("Inplace", {false}),
+ make("Padding", {true})))
{
validate(CLAccessor(_target), _reference, tolerance_f32);
}
@@ -151,24 +161,136 @@ FIXTURE_DATA_TEST_CASE(RunSmallMultiIndices, CLScatterLayerFixture<float>, frame
make("DataType", {DataType::F32}),
make("ScatterFunction", {ScatterFunction::Update, ScatterFunction::Add }),
make("ZeroInit", {false}),
- make("Inplace", {false, true})))
+ make("Inplace", {false, true}),
+ make("Padding", {true})))
{
validate(CLAccessor(_target), _reference, tolerance_f32);
}
// m+k, k-1-D m+n-D case
-FIXTURE_DATA_TEST_CASE(RunSmallBatchedMultiIndices, CLScatterLayerFixture<float>, framework::DatasetMode::DISABLED,
+FIXTURE_DATA_TEST_CASE(RunSmallBatchedMultiIndices, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
combine(datasets::SmallScatterBatchedDataset(),
make("DataType", {DataType::F32}),
- make("ScatterFunction", {ScatterFunction::Update, ScatterFunction::Add }),
+ make("ScatterFunction", {ScatterFunction::Update, ScatterFunction::Add}),
+ make("ZeroInit", {false}),
+ make("Inplace", {false}),
+ make("Padding", {true})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+// Scalar and batched-scalar update case
+FIXTURE_DATA_TEST_CASE(RunSmallScatterScalar, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+ combine(datasets::SmallScatterScalarDataset(),
+ make("DataType", {DataType::F32}),
+ make("ScatterFunction", {ScatterFunction::Update, ScatterFunction::Add}),
make("ZeroInit", {false}),
- make("Inplace", {false})))
+ make("Inplace", {false}),
+ make("Padding", {false}))) // NOTE: Padding not supported in this datset
{
validate(CLAccessor(_target), _reference, tolerance_f32);
}
TEST_SUITE_END() // FP32
+
+
+// NOTE: Padding is disabled for the SmallScatterMixedDataset because certain shapes in it do not support padding.
+// Padding is well tested by the F32 data type test cases.
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+ combine(datasets::SmallScatterMixedDataset(),
+ make("DataType", {DataType::F16}),
+ allScatterFunctions,
+ make("ZeroInit", {false}),
+ make("Inplace", {false}),
+ make("Padding", {false})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // FP16
TEST_SUITE_END() // Float
+
+TEST_SUITE(Integer)
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<int32_t>, framework::DatasetMode::PRECOMMIT,
+ combine(datasets::SmallScatterMixedDataset(),
+ make("DataType", {DataType::S32}),
+ allScatterFunctions,
+ make("ZeroInit", {false}),
+ make("Inplace", {false}),
+ make("Padding", {false})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_int);
+}
+TEST_SUITE_END() // S32
+
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<int16_t>, framework::DatasetMode::PRECOMMIT,
+ combine(datasets::SmallScatterMixedDataset(),
+ make("DataType", {DataType::S16}),
+ allScatterFunctions,
+ make("ZeroInit", {false}),
+ make("Inplace", {false}),
+ make("Padding", {false})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_int);
+}
+TEST_SUITE_END() // S16
+
+TEST_SUITE(S8)
+FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+ combine(datasets::SmallScatterMixedDataset(),
+ make("DataType", {DataType::S8}),
+ allScatterFunctions,
+ make("ZeroInit", {false}),
+ make("Inplace", {false}),
+ make("Padding", {false})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_int);
+}
+TEST_SUITE_END() // S8
+
+TEST_SUITE(U32)
+FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<uint32_t>, framework::DatasetMode::PRECOMMIT,
+ combine(datasets::SmallScatterMixedDataset(),
+ make("DataType", {DataType::U32}),
+ allScatterFunctions,
+ make("ZeroInit", {false}),
+ make("Inplace", {false}),
+ make("Padding", {false})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_int);
+}
+TEST_SUITE_END() // U32
+
+TEST_SUITE(U16)
+FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<uint16_t>, framework::DatasetMode::PRECOMMIT,
+ combine(datasets::SmallScatterMixedDataset(),
+ make("DataType", {DataType::U16}),
+ allScatterFunctions,
+ make("ZeroInit", {false}),
+ make("Inplace", {false}),
+ make("Padding", {false})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_int);
+}
+TEST_SUITE_END() // U16
+
+TEST_SUITE(U8)
+FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+ combine(datasets::SmallScatterMixedDataset(),
+ make("DataType", {DataType::U8}),
+ allScatterFunctions,
+ make("ZeroInit", {false}),
+ make("Inplace", {false}),
+ make("Padding", {false})))
+{
+ validate(CLAccessor(_target), _reference, tolerance_int);
+}
+TEST_SUITE_END() // U8
+TEST_SUITE_END() // Integer
+
TEST_SUITE_END() // Scatter
TEST_SUITE_END() // CL
} // namespace validation
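A note on the tolerances introduced at the top of this file: the integer suites compare with tolerance_int(0), i.e. an exact match, while FP16 allows 2% relative error. A conceptual sketch of the relative check (this mirrors the intent of the tolerances above, not ACL's exact validate() internals):

    #include <cassert>
    #include <cmath>

    // Accept a mismatch when |target - reference| <= tol * |reference|.
    // tolerance_f16(0.02f) permits 2% relative error; tolerance_int(0)
    // degenerates to an exact comparison.
    bool within_relative_tolerance(float target, float reference, float tol)
    {
        return std::fabs(target - reference) <= tol * std::fabs(reference);
    }

    int main()
    {
        assert(within_relative_tolerance(1.01f, 1.0f, 0.02f)); // passes under FP16 tolerance
        assert(!within_relative_tolerance(7.0f, 6.0f, 0.0f));  // integers must match exactly
    }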
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index 9b1da61ed7..d25f43a330 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -360,13 +360,21 @@ TEST_SUITE_END() // DynamicQuantization
// Dequant tests involve returning F32 from the MatrixMultiplyCore kernels and are only implemented in aarch64
TEST_SUITE(Dequant)
constexpr AbsoluteTolerance<float> tolerance_dequantized(0.01f);
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::ALL,
+ combine(
+ datasets::SmallGEMMLowpDataset(),
+ make("accumulate", {true, false})
+ ))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_dequantized);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::NIGHTLY,
+ combine(
+ datasets::LargeGEMMLowpDataset(),
+ make("accumulate", {false})
+ ))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_dequantized);
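The new "accumulate" axis toggles whether the dequantized GEMM result overwrites the destination or is added onto its existing contents. A minimal reference sketch of that semantic, operating on already-dequantized floats (ACL's kernels do the quantized arithmetic internally):

    #include <vector>

    // With accumulate == true the product is added onto dst instead of
    // overwriting it; this is the behaviour the new dataset axis exercises.
    void gemm_dequant(const std::vector<float> &a, const std::vector<float> &b,
                      std::vector<float> &dst, int M, int N, int K, bool accumulate)
    {
        for (int m = 0; m < M; ++m)
            for (int n = 0; n < N; ++n)
            {
                float acc = accumulate ? dst[m * N + n] : 0.f;
                for (int k = 0; k < K; ++k)
                    acc += a[m * K + k] * b[k * N + n];
                dst[m * N + n] = acc;
            }
    }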
diff --git a/tests/validation/NEON/ReorderLayer.cpp b/tests/validation/NEON/ReorderLayer.cpp
index 42fa0f8b00..839ad0ac92 100644
--- a/tests/validation/NEON/ReorderLayer.cpp
+++ b/tests/validation/NEON/ReorderLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,6 +33,7 @@
#include "tests/validation/Validation.h"
#include "tests/validation/fixtures/ReorderFixture.h"
#include "src/core/NEON/kernels/NEReorderKernel.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
namespace arm_compute
{
@@ -40,6 +41,8 @@ namespace test
{
namespace validation
{
+using framework::dataset::make;
+
TEST_SUITE(NEON)
TEST_SUITE(ReorderLayer)
@@ -48,13 +51,46 @@ using NEReorderLayerAlias = ReorderValidationFixture<Tensor, Accessor, NEReorder
TEST_SUITE(FP32)
#if defined(ARM_COMPUTE_ENABLE_SVE)
-FIXTURE_DATA_TEST_CASE(RunBlock8, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock8(), framework::dataset::make("DataType", DataType::F32)))
+DATA_TEST_CASE(ValidateReorderOHWIo8, framework::DatasetMode::ALL, combine(
+ zip(
+ make("InShape",{ TensorShape(10U, 9U), TensorShape(234U, 301U) }),
+ make("OutShape", { TensorShape(10U, 16U), TensorShape(234U, 304U) })
+ ),
+ zip(
+ make("InputWeightFormat", {WeightFormat::OHWI}),
+ make("OutputWeightFormat", {WeightFormat::OHWIo8})
+ )),
+ input_shape, output_shape, input_wf, output_wf)
+{
+ if(Scheduler::get().cpu_info().has_sve())
+ {
+ arm_compute::NEReorderLayer reorder_layer;
+ int vector_length = arm_gemm::utils::get_vector_length<float>();
+ // Reorder to OHWIo8 is only expected to validate when the SVE vector length is 8 floats.
+ const bool expected_bool_status = (vector_length == 8);
+
+ TensorInfo input_tensor_info(input_shape, 1, DataType::F32);
+ TensorInfo output_tensor_info(output_shape, 1, DataType::F32);
+
+ Status status = reorder_layer.validate(&input_tensor_info, &output_tensor_info, input_wf, output_wf);
+
+ ARM_COMPUTE_EXPECT((expected_bool_status == bool(status)), framework::LogLevel::ERRORS);
+ }
+}
+
+FIXTURE_DATA_TEST_CASE(RunBlock8, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock8(), make("DataType", DataType::F32)))
{
// Validate output
- validate(Accessor(_target), _reference);
+ if (_hardware_supports)
+ {
+ validate(Accessor(_target), _reference);
+ }
}
#endif // ARM_COMPUTE_ENABLE_SVE
-FIXTURE_DATA_TEST_CASE(RunBlock4, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock4(), framework::dataset::make("DataType", DataType::F32)))
+
+FIXTURE_DATA_TEST_CASE(RunBlock4, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock4(), make("DataType", DataType::F32)))
{
// Validate output
validate(Accessor(_target), _reference);
@@ -68,4 +104,4 @@ TEST_SUITE_END() // NEON
} // namespace test
} // namespace arm_compute
-#endif // defined(__aarch64__) \ No newline at end of file
+#endif // defined(__aarch64__)
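The shape pairs in ValidateReorderOHWIo8 encode the OHWIo8 blocking rule: the blocked dimension is rounded up to the next multiple of 8 (9 becomes 16, 301 becomes 304), and validate() is only expected to succeed when the SVE vector length for float is 8 elements. A minimal sketch of the rounding:

    #include <cassert>
    #include <cstddef>

    // OHWIo8 blocks the O dimension in groups of 8, so the reordered shape
    // rounds it up to the next multiple of 8.
    constexpr std::size_t round_up(std::size_t v, std::size_t block)
    {
        return ((v + block - 1) / block) * block;
    }

    int main()
    {
        assert(round_up(9, 8) == 16);    // TensorShape(10U, 9U)   -> TensorShape(10U, 16U)
        assert(round_up(301, 8) == 304); // TensorShape(234U, 301U) -> TensorShape(234U, 304U)
    }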
diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp
index 8da5a0d953..94d0866c38 100644
--- a/tests/validation/NEON/SoftmaxLayer.cpp
+++ b/tests/validation/NEON/SoftmaxLayer.cpp
@@ -145,7 +145,7 @@ DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
cpu_isa.fp16 = (data_type == DataType::F16);
const auto *selected_impl = CpuSoftmaxKernel::get_implementation(
- SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */, 0 /* axis */},
+ SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */, 0 /* axis */, CPUInfo::get().get_sme2_vector_length()},
cpu::KernelSelectionType::Preferred);
ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
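Kernel selection for softmax now also receives the SME2 vector length. A hedged sketch of the shape such a selector predicate could take; the struct and function names here are illustrative, not ACL's actual internals:

    #include <cstdint>

    // Hypothetical selector data: an SME2 kernel would only be preferred when
    // the CPU reports SME2 and a usable streaming vector length, which is why
    // the test now passes CPUInfo::get().get_sme2_vector_length().
    struct SelectorData
    {
        bool     sme2;
        uint64_t sme2_vector_length; // in elements
    };

    bool prefers_sme2_kernel(const SelectorData &d)
    {
        return d.sme2 && d.sme2_vector_length > 0;
    }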
diff --git a/tests/validation/NEON/UNIT/RuntimeContext.cpp b/tests/validation/NEON/UNIT/RuntimeContext.cpp
index 819811943d..e0d45c639a 100644
--- a/tests/validation/NEON/UNIT/RuntimeContext.cpp
+++ b/tests/validation/NEON/UNIT/RuntimeContext.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,6 +48,19 @@ namespace validation
{
TEST_SUITE(NEON)
TEST_SUITE(UNIT)
+#if defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)
+TEST_CASE(CpuCapacity, framework::DatasetMode::ALL)
+{
+ CPUInfo& ci = arm_compute::Scheduler::get().cpu_info();
+ const uint32_t nonlittle_num_cpus = ci.get_cpu_num_excluding_little();
+ const uint32_t num_threads = arm_compute::Scheduler::get().num_threads();
+
+ ARM_COMPUTE_EXPECT(num_threads <= nonlittle_num_cpus, framework::LogLevel::ERRORS);
+}
+#endif /* defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/
+
TEST_SUITE(RuntimeContext)
TEST_CASE(Scheduler, framework::DatasetMode::ALL)
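The CpuCapacity test above asserts a capping rule: on the guarded Android big.LITTLE configurations, the OpenMP scheduler should never use more threads than there are non-little cores. A minimal sketch of that rule, assuming the two counts are already known:

    #include <algorithm>
    #include <cstdint>

    // The assertion num_threads <= nonlittle_num_cpus is equivalent to the
    // scheduler clamping its thread count like this.
    uint32_t capped_thread_count(uint32_t requested, uint32_t nonlittle_num_cpus)
    {
        return std::min(requested, nonlittle_num_cpus);
    }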
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index 6b7cbba92e..aa4eedb75d 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -472,15 +472,9 @@ template <typename TensorType, typename AccessorType, typename FunctionType, boo
class GEMMLowpDequantizedMatrixMultiplyValidationFixture : public framework::Fixture
{
public:
- void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset)
+ void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, bool accumulate)
{
- // Accumulation is supported for Int8/UInt8 only in aarch64
- bool accumulate = true;
- // Accumulation is not supported for Int8/UInt8 in aarch32
-#ifdef __arm__
- accumulate = false;
-#endif //__arm__
- bool dynamic_qinfo = false;
+ const bool dynamic_qinfo = false;
const auto a_qinfo = QuantizationInfo(1.0f / 255, a_offset);
const auto b_qinfo = QuantizationInfo(5.0f / 255, b_offset);
TensorFillInfo finfo;
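The fixture now takes "accumulate" from the dataset instead of hard-coding an arch-dependent default. For reference, QuantizationInfo(scale, offset) as used above follows ACL's affine scheme, mapping a quantized value q back to scale * (q - offset); with a_qinfo the scale is 1.0f / 255. A one-line sketch of that mapping:

    #include <cstdint>

    // Affine dequantization as implied by QuantizationInfo(scale, offset).
    inline float dequantize(int32_t q, float scale, int32_t offset)
    {
        return scale * static_cast<float>(q - offset);
    }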
diff --git a/tests/validation/fixtures/ReorderFixture.h b/tests/validation/fixtures/ReorderFixture.h
index 36e62696bc..8e28484c48 100644
--- a/tests/validation/fixtures/ReorderFixture.h
+++ b/tests/validation/fixtures/ReorderFixture.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_TESTS_VALIDATION_FIXTURES_REORDERFIXTURE
-#define ACL_TESTS_VALIDATION_FIXTURES_REORDERFIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_REORDERFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_REORDERFIXTURE_H
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
@@ -32,6 +32,7 @@
#include "tests/framework/Asserts.h"
#include "tests/framework/Fixture.h"
#include "tests/validation/reference/Reorder.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
namespace arm_compute
{
@@ -44,10 +45,23 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
class ReorderValidationFixture : public framework::Fixture
{
public:
+ void check_hardware_supports(WeightFormat output_wf)
+ {
+ if(!Scheduler::get().cpu_info().has_sve() && output_wf != WeightFormat::OHWIo4)
+ {
+ _hardware_supports = false;
+ }
+ if(Scheduler::get().cpu_info().has_sve() && arm_gemm::utils::get_vector_length<float>() != 8 && output_wf == WeightFormat::OHWIo8)
+ {
+ _hardware_supports = false;
+ }
+ }
+
void setup(TensorShape input_shape, TensorShape output_shape, WeightFormat input_wf, WeightFormat output_wf, DataType data_type)
{
- _target = compute_target(input_shape, output_shape, input_wf, output_wf, data_type);
- _reference = compute_reference(input_shape, output_shape, output_wf, data_type);
+ check_hardware_supports(output_wf);
+ if (_hardware_supports)
+ {
+ _target = compute_target(input_shape, output_shape, input_wf, output_wf, data_type);
+ _reference = compute_reference(input_shape, output_shape, output_wf, data_type);
+ }
}
protected:
@@ -98,6 +112,7 @@ public:
return reference::reorder_layer<T>(src, output_shape, output_wf);
}
+ bool _hardware_supports = true;
TensorType _target{};
SimpleTensor<T> _reference{};
};
@@ -105,4 +120,4 @@ public:
} // namespace validation
} // namespace test
} // namespace arm_compute
-#endif /* ACL_TESTS_VALIDATION_FIXTURES_REORDERFIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_REORDERFIXTURE_H
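check_hardware_supports encodes two rules: without SVE only the OHWIo4 block format is supported, and with SVE the OHWIo8 format additionally needs an 8-float (256-bit) vector length. A standalone sketch of the same predicate (names illustrative, detached from the Scheduler/arm_gemm calls used above):

    #include <cstdio>

    enum class Fmt { OHWIo4, OHWIo8 };

    // Mirrors the two early-outs in check_hardware_supports above.
    bool hardware_supports(bool has_sve, int float_vector_length, Fmt out)
    {
        if (!has_sve)
            return out == Fmt::OHWIo4;
        if (out == Fmt::OHWIo8)
            return float_vector_length == 8;
        return true;
    }

    int main()
    {
        // Without SVE, OHWIo8 is rejected and the fixture skips compute.
        std::printf("%d\n", static_cast<int>(hardware_supports(false, 4, Fmt::OHWIo8))); // 0
    }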
diff --git a/tests/validation/fixtures/ScatterLayerFixture.h b/tests/validation/fixtures/ScatterLayerFixture.h
index 4fb2d7f127..af161ef98b 100644
--- a/tests/validation/fixtures/ScatterLayerFixture.h
+++ b/tests/validation/fixtures/ScatterLayerFixture.h
@@ -48,7 +48,7 @@ class ScatterGenericValidationFixture : public framework::Fixture
{
public:
void setup(TensorShape src_shape, TensorShape updates_shape, TensorShape indices_shape,
- TensorShape out_shape, DataType data_type, ScatterInfo scatter_info, bool inplace,
+ TensorShape out_shape, DataType data_type, ScatterInfo scatter_info, bool inplace, bool padding,
QuantizationInfo src_qinfo = QuantizationInfo(), QuantizationInfo o_qinfo = QuantizationInfo())
{
// this is for improving randomness across tests
@@ -57,19 +57,36 @@ public:
+ updates_shape[4] + updates_shape[5]
+ indices_shape[0] + indices_shape[1] + indices_shape[2] + indices_shape[3];
- _target = compute_target(src_shape, updates_shape, indices_shape, out_shape, data_type, scatter_info, inplace, src_qinfo, o_qinfo);
+ _target = compute_target(src_shape, updates_shape, indices_shape, out_shape, data_type, scatter_info, inplace, padding, src_qinfo, o_qinfo);
_reference = compute_reference(src_shape, updates_shape, indices_shape, out_shape, data_type, scatter_info, src_qinfo, o_qinfo);
}
protected:
template <typename U>
- void fill(U &&tensor, int i, float lo = -10.f, float hi = 10.f)
+ void fill(U &&tensor, int i)
{
switch(tensor.data_type())
{
case DataType::F32:
+ case DataType::F16:
{
- std::uniform_real_distribution<float> distribution(lo, hi);
+ std::uniform_real_distribution<float> distribution(-10.f, 10.f);
+ library->fill(tensor, distribution, i);
+ break;
+ }
+ case DataType::S32:
+ case DataType::S16:
+ case DataType::S8:
+ {
+ std::uniform_int_distribution<int32_t> distribution(-100, 100);
+ library->fill(tensor, distribution, i);
+ break;
+ }
+ case DataType::U32:
+ case DataType::U16:
+ case DataType::U8:
+ {
+ std::uniform_int_distribution<uint32_t> distribution(0, 200);
library->fill(tensor, distribution, i);
break;
}
@@ -86,12 +103,12 @@ protected:
void fill_indices(U &&tensor, int i, const TensorShape &shape)
{
// Calculate the max index the shape should contain. Add an arbitrary value to allow testing for some out-of-bounds values (in this case, the min dimension).
- const int32_t max = std::max({shape[0] , shape[1], shape[2]});
- library->fill_tensor_uniform(tensor, i, static_cast<int32_t>(-2), static_cast<int32_t>(max));
+ const int32_t max = std::min({shape[0], shape[1], shape[2]}) + 1;
+ library->fill_tensor_uniform(tensor, i, static_cast<int32_t>(0), static_cast<int32_t>(max));
}
TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c,
- const TensorShape &out_shape, DataType data_type, const ScatterInfo info, bool inplace,
+ const TensorShape &out_shape, DataType data_type, const ScatterInfo info, bool inplace, bool padding,
QuantizationInfo a_qinfo, QuantizationInfo o_qinfo)
{
// 1. Create relevant tensors using ScatterInfo data structure.
@@ -129,11 +146,14 @@ protected:
ARM_COMPUTE_ASSERT(indices.info()->is_resizable());
ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
- add_padding_x({ &src, &updates, &indices});
-
- if(!inplace)
+ if(padding)
{
- add_padding_x({ &dst });
+ add_padding_x({ &src, &updates, &indices});
+
+ if(!inplace)
+ {
+ add_padding_x({ &dst });
+ }
}
// Allocate tensors
@@ -180,12 +200,13 @@ protected:
TensorShape src_shape = a_shape;
TensorShape updates_shape = b_shape;
TensorShape indices_shape = c_shape;
+ const int num_ind_dims = c_shape.num_dimensions();
// 1. Collapse batch index into a single dim if necessary for update tensor and indices tensor.
- if(c_shape.num_dimensions() >= 3)
+ if(num_ind_dims >= 3)
{
indices_shape = indices_shape.collapsed_from(1);
- updates_shape = updates_shape.collapsed_from(updates_shape.num_dimensions() - 2); // Collapses from last 2 dims
+ updates_shape = updates_shape.collapsed_from(updates_shape.num_dimensions() - (num_ind_dims - 1)); // Collapses batch dims
}
// 2. Collapse data dims into a single dim.
@@ -195,16 +216,16 @@ protected:
updates_shape.collapse(updates_shape.num_dimensions() - 1); // Collapse data dims (all except last dim which is batch dim)
// Create reference tensors
- SimpleTensor<T> src{ a_shape, data_type, 1, a_qinfo };
- SimpleTensor<T> updates{b_shape, data_type, 1, QuantizationInfo() };
- SimpleTensor<int32_t> indices{ c_shape, DataType::S32, 1, QuantizationInfo() };
+ SimpleTensor<T> src{ src_shape, data_type, 1, a_qinfo };
+ SimpleTensor<T> updates{updates_shape, data_type, 1, QuantizationInfo() };
+ SimpleTensor<int32_t> indices{ indices_shape, DataType::S32, 1, QuantizationInfo() };
// Fill reference
fill(src, 0 + _hash);
fill(updates, 1 + _hash);
fill_indices(indices, 2 + _hash, out_shape);
- // Calculate individual reference.
+ // Calculate individual reference using collapsed shapes
return reference::scatter_layer<T>(src, updates, indices, out_shape, info);
}
@@ -219,10 +240,10 @@ class ScatterValidationFixture : public ScatterGenericValidationFixture<TensorTy
{
public:
void setup(TensorShape src_shape, TensorShape update_shape, TensorShape indices_shape,
- TensorShape out_shape, DataType data_type, ScatterFunction func, bool zero_init, bool inplace)
+ TensorShape out_shape, DataType data_type, ScatterFunction func, bool zero_init, bool inplace, bool padding)
{
ScatterGenericValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(src_shape, update_shape,
- indices_shape, out_shape, data_type, ScatterInfo(func, zero_init), inplace,
+ indices_shape, out_shape, data_type, ScatterInfo(func, zero_init), inplace, padding,
QuantizationInfo(), QuantizationInfo());
}
};
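The reference path above collapses the batch dimensions of the indices and updates tensors before calling the reference scatter. A worked sketch of that collapse for the k = 3 batched config (indices (3,3,6,2), updates (5,5,3,6,2), innermost-first), assuming collapsed_from(d) multiplies dimensions d and above into one trailing dimension, as TensorShape does:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Collapse dimensions d.. into a single trailing dimension.
    std::vector<std::size_t> collapsed_from(std::vector<std::size_t> s, std::size_t d)
    {
        std::size_t prod = 1;
        for (std::size_t i = d; i < s.size(); ++i)
            prod *= s[i];
        s.resize(d);
        s.push_back(prod);
        return s;
    }

    int main()
    {
        // num_ind_dims = 4, so updates collapse from 5 - (4 - 1) = 2.
        assert((collapsed_from({3, 3, 6, 2}, 1) == std::vector<std::size_t>{3, 36}));
        assert((collapsed_from({5, 5, 3, 6, 2}, 2) == std::vector<std::size_t>{5, 5, 36}));
    }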
diff --git a/tests/validation/reference/ScatterLayer.cpp b/tests/validation/reference/ScatterLayer.cpp
index 283022e8e2..55c48a9002 100644
--- a/tests/validation/reference/ScatterLayer.cpp
+++ b/tests/validation/reference/ScatterLayer.cpp
@@ -63,6 +63,7 @@ T reduce_op(const T &current,const T &update,const ScatterFunction func)
}
template float reduce_op(const float &current,const float &update,const ScatterFunction func);
+template half reduce_op(const half &current,const half &update,const ScatterFunction func);
}
// NOTE: This function expects collapsed tensors as input.
@@ -138,7 +139,13 @@ SimpleTensor<T> scatter_layer(const SimpleTensor<T> &src, const SimpleTensor<T>
}
template SimpleTensor<float> scatter_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
-
+template SimpleTensor<half> scatter_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<int32_t> scatter_layer(const SimpleTensor<int32_t> &src, const SimpleTensor<int32_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<uint32_t> scatter_layer(const SimpleTensor<uint32_t> &src, const SimpleTensor<uint32_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<int16_t> scatter_layer(const SimpleTensor<int16_t> &src, const SimpleTensor<int16_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<uint16_t> scatter_layer(const SimpleTensor<uint16_t> &src, const SimpleTensor<uint16_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<int8_t> scatter_layer(const SimpleTensor<int8_t> &src, const SimpleTensor<int8_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<uint8_t> scatter_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
} // namespace reference
} // namespace validation
} // namespace test
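The block of explicit instantiations added above pins down every element type the reference scatter now supports. For orientation, a minimal standalone sketch of the reduce_op being instantiated, assuming the standard scatter reductions (the real enum and implementation live in arm_compute's Types.h and this ScatterLayer.cpp):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    enum class ScatterFunction { Update, Add, Sub, Max, Min };

    // Combine the current output element with an update, per the reduction.
    template <typename T>
    T reduce_op(const T &current, const T &update, ScatterFunction func)
    {
        switch (func)
        {
            case ScatterFunction::Update: return update;
            case ScatterFunction::Add:    return current + update;
            case ScatterFunction::Sub:    return current - update;
            case ScatterFunction::Max:    return std::max(current, update);
            case ScatterFunction::Min:    return std::min(current, update);
        }
        return current;
    }

    int main()
    {
        // Each explicit instantiation above fixes one such element type T.
        std::printf("%d\n", reduce_op<int32_t>(3, 4, ScatterFunction::Add)); // 7
    }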