Diffstat (limited to 'tests/validation')
22 files changed, 1071 insertions, 298 deletions
diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp index 1ae9e96626..78d794a9bb 100644 --- a/tests/validation/CL/GEMMLowp.cpp +++ b/tests/validation/CL/GEMMLowp.cpp @@ -71,7 +71,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyCoreFixture, framework: } using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned = - GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, false, false, uint8_t, uint8_t, true>; + GEMMLowpBatchedMatrixMultiplyCoreFusedOffsetOutputFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, false, false, uint8_t, uint8_t, true>; TEST_SUITE(BatchedMatMul) TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned, framework::DatasetMode::ALL, @@ -84,7 +84,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFi TEST_SUITE_END() // QASYMM8 using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned = - GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, false, false, int8_t, int8_t, true>; + GEMMLowpBatchedMatrixMultiplyCoreFusedOffsetOutputFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, false, false, int8_t, int8_t, true>; TEST_SUITE(QASYMM8_SIGNED) FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(), @@ -98,7 +98,7 @@ TEST_SUITE_END() // BatchedMatMul TEST_SUITE(FusedOffsetOutput) TEST_SUITE(QASYMM8) -using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore>; +using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore>; FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(), make("DataType", { DataType::QASYMM8 }), @@ -110,7 +110,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUi TEST_SUITE(Output3D) using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputOutput3DUint8Fixture = - GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, false, true>; + GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, false, true>; FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputOutput3DUint8Fixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputOutput3DUint8Dataset(), make("DataType", { DataType::QASYMM8 }), @@ -123,7 +123,7 @@ TEST_SUITE_END() // Output3D TEST_SUITE(InputOutput3D) using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInputOutput3DUint8Fixture = - GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, true, true>; + GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, true, true>; FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInputOutput3DUint8Fixture, framework::DatasetMode::ALL, 
combine(datasets::SmallGEMMLowpFusedOffsetOutputInputOutput3DUint8Dataset(), make("DataType", { DataType::QASYMM8 }), @@ -148,7 +148,8 @@ using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInt8Fixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, false, false, int8_t, int8_t>; FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputInt8Fixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputInt8Dataset(), - make("DataType", { DataType::QASYMM8_SIGNED }))) + make("DataType", { DataType::QASYMM8_SIGNED }), + make("reshape_b_only_on_first_run", { false }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_quant); diff --git a/tests/validation/CL/ScatterLayer.cpp b/tests/validation/CL/ScatterLayer.cpp index 56338f489f..b1531eb64a 100644 --- a/tests/validation/CL/ScatterLayer.cpp +++ b/tests/validation/CL/ScatterLayer.cpp @@ -38,6 +38,12 @@ namespace test { namespace validation { +namespace +{ +RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for fp32 data type */ +RelativeTolerance<float> tolerance_f16(0.02f); /**< Tolerance value for comparing reference's output against implementation's output for fp16 data type */ +RelativeTolerance<int32_t> tolerance_int(0); /**< Tolerance value for comparing reference's output against implementation's output for integer data types */ +} // namespace template <typename T> using CLScatterLayerFixture = ScatterValidationFixture<CLTensor, CLAccessor, CLScatter, T>; @@ -46,69 +52,245 @@ using framework::dataset::make; TEST_SUITE(CL) TEST_SUITE(Scatter) -DATA_TEST_CASE(Validate, framework::DatasetMode::DISABLED, zip( +DATA_TEST_CASE(Validate, framework::DatasetMode::PRECOMMIT, zip( make("InputInfo", { TensorInfo(TensorShape(9U), 1, DataType::F32), // Mismatching data types - TensorInfo(TensorShape(15U), 1, DataType::F32), // Valid - TensorInfo(TensorShape(8U), 1, DataType::F32), - TensorInfo(TensorShape(217U), 1, DataType::F32), // Mismatch input/output dims. - TensorInfo(TensorShape(217U), 1, DataType::F32), // Updates dim higher than Input/Output dims. - TensorInfo(TensorShape(12U), 1, DataType::F32), // Indices wrong datatype. 
- }), - make("UpdatesInfo",{ TensorInfo(TensorShape(3U), 1, DataType::F16), - TensorInfo(TensorShape(15U), 1, DataType::F32), - TensorInfo(TensorShape(2U), 1, DataType::F32), - TensorInfo(TensorShape(217U), 1, DataType::F32), - TensorInfo(TensorShape(217U, 3U), 1, DataType::F32), - TensorInfo(TensorShape(2U), 1, DataType::F32), - }), - make("IndicesInfo",{ TensorInfo(TensorShape(3U), 1, DataType::U32), - TensorInfo(TensorShape(15U), 1, DataType::U32), - TensorInfo(TensorShape(2U), 1, DataType::U32), - TensorInfo(TensorShape(271U), 1, DataType::U32), - TensorInfo(TensorShape(271U), 1, DataType::U32), - TensorInfo(TensorShape(2U), 1 , DataType::S32) - }), - make("OutputInfo",{ TensorInfo(TensorShape(9U), 1, DataType::F16), - TensorInfo(TensorShape(15U), 1, DataType::F32), - TensorInfo(TensorShape(8U), 1, DataType::F32), - TensorInfo(TensorShape(271U, 3U), 1, DataType::F32), - TensorInfo(TensorShape(271U), 1, DataType::F32), - TensorInfo(TensorShape(12U), 1, DataType::F32) - }), + TensorInfo(TensorShape(15U), 1, DataType::F32), // Valid + TensorInfo(TensorShape(15U), 1, DataType::U8), // Valid + TensorInfo(TensorShape(8U), 1, DataType::F32), + TensorInfo(TensorShape(217U), 1, DataType::F32), // Mismatch input/output dims. + TensorInfo(TensorShape(217U), 1, DataType::F32), // Updates dim higher than Input/Output dims. + TensorInfo(TensorShape(12U), 1, DataType::F32), // Indices wrong datatype. + TensorInfo(TensorShape(9U, 3U, 4U), 1, DataType::F32), // Number of updates != number of indices + TensorInfo(TensorShape(17U, 3U, 3U, 2U), 1, DataType::F32), // index_len != (dst_dims - upt_dims + 1) + TensorInfo(TensorShape(17U, 3U, 3U, 2U, 2U, 2U), 1, DataType::F32), // index_len > 5 + }), + make("UpdatesInfo",{TensorInfo(TensorShape(3U), 1, DataType::F16), + TensorInfo(TensorShape(15U), 1, DataType::F32), + TensorInfo(TensorShape(15U), 1, DataType::U8), + TensorInfo(TensorShape(2U), 1, DataType::F32), + TensorInfo(TensorShape(217U), 1, DataType::F32), + TensorInfo(TensorShape(217U, 3U), 1, DataType::F32), + TensorInfo(TensorShape(2U), 1, DataType::F32), + TensorInfo(TensorShape(9U, 3U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(17U, 3U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(1U), 1, DataType::F32), + }), + make("IndicesInfo",{TensorInfo(TensorShape(1U, 3U), 1, DataType::S32), + TensorInfo(TensorShape(1U, 15U), 1, DataType::S32), + TensorInfo(TensorShape(1U, 15U), 1, DataType::S32), + TensorInfo(TensorShape(1U, 2U), 1, DataType::S32), + TensorInfo(TensorShape(1U, 271U), 1, DataType::S32), + TensorInfo(TensorShape(1U, 271U), 1, DataType::S32), + TensorInfo(TensorShape(1U, 2U), 1 , DataType::F32), + TensorInfo(TensorShape(1U, 4U), 1, DataType::S32), + TensorInfo(TensorShape(3U, 2U), 1, DataType::S32), + TensorInfo(TensorShape(6U, 2U), 1, DataType::S32), + }), + make("OutputInfo",{TensorInfo(TensorShape(9U), 1, DataType::F16), + TensorInfo(TensorShape(15U), 1, DataType::F32), + TensorInfo(TensorShape(15U), 1, DataType::U8), + TensorInfo(TensorShape(8U), 1, DataType::F32), + TensorInfo(TensorShape(271U, 3U), 1, DataType::F32), + TensorInfo(TensorShape(271U), 1, DataType::F32), + TensorInfo(TensorShape(12U), 1, DataType::F32), + TensorInfo(TensorShape(9U, 3U, 4U), 1, DataType::F32), + TensorInfo(TensorShape(17U, 3U, 3U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(17U, 3U, 3U, 2U, 2U, 2U), 1, DataType::F32), + }), make("ScatterInfo",{ ScatterInfo(ScatterFunction::Add, false), - }), - make("Expected", { false, true, true, false, false, false })), + ScatterInfo(ScatterFunction::Max, 
false), + ScatterInfo(ScatterFunction::Max, false), + ScatterInfo(ScatterFunction::Min, false), + ScatterInfo(ScatterFunction::Add, false), + ScatterInfo(ScatterFunction::Update, false), + ScatterInfo(ScatterFunction::Sub, false), + ScatterInfo(ScatterFunction::Sub, false), + ScatterInfo(ScatterFunction::Update, false), + ScatterInfo(ScatterFunction::Update, false), + }), + make("Expected", { false, true, true, true, false, false, false, false, false, false })), input_info, updates_info, indices_info, output_info, scatter_info, expected) { - // TODO: Enable validation tests. - ARM_COMPUTE_UNUSED(input_info); - ARM_COMPUTE_UNUSED(updates_info); - ARM_COMPUTE_UNUSED(indices_info); - ARM_COMPUTE_UNUSED(output_info); - ARM_COMPUTE_UNUSED(scatter_info); - ARM_COMPUTE_UNUSED(expected); + const Status status = CLScatter::validate(&input_info, &updates_info, &indices_info, &output_info, scatter_info); + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); } +const auto allScatterFunctions = make("ScatterFunction", + {ScatterFunction::Update, ScatterFunction::Add, ScatterFunction::Sub, ScatterFunction::Min, ScatterFunction::Max }); + TEST_SUITE(Float) TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small1DScatterDataset(), - make("DataType", {DataType::F32}), - make("ScatterFunction", {ScatterFunction::Update, ScatterFunction::Add, ScatterFunction::Sub, ScatterFunction::Min, ScatterFunction::Max}), - make("ZeroInit", {false}))) +FIXTURE_DATA_TEST_CASE(RunSmall, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT, + combine(datasets::Small1DScatterDataset(), + make("DataType", {DataType::F32}), + allScatterFunctions, + make("ZeroInit", {false}), + make("Inplace", {false}), + make("Padding", {true}))) { - // TODO: Add validate() here. + validate(CLAccessor(_target), _reference, tolerance_f32); } // With this test, src should be passed as nullptr. -FIXTURE_DATA_TEST_CASE(RunSmallZeroInit, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small1DScatterDataset(), - make("DataType", {DataType::F32}), - make("ScatterFunction", {ScatterFunction::Add}), - make("ZeroInit", {true}))) +FIXTURE_DATA_TEST_CASE(RunSmallZeroInit, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT, + combine(datasets::Small1DScatterDataset(), + make("DataType", {DataType::F32}), + make("ScatterFunction", {ScatterFunction::Add}), + make("ZeroInit", {true}), + make("Inplace", {false}), + make("Padding", {true}))) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +// Updates/src/dst have same no. dims. 
+FIXTURE_DATA_TEST_CASE(RunSmallMultiDim, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallScatterMultiDimDataset(), + make("DataType", {DataType::F32}), + allScatterFunctions, + make("ZeroInit", {false}), + make("Inplace", {false}), + make("Padding", {true}))) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +// m+1-D to m+n-D cases +FIXTURE_DATA_TEST_CASE(RunSmallMultiIndices, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallScatterMultiIndicesDataset(), + make("DataType", {DataType::F32}), + make("ScatterFunction", {ScatterFunction::Update, ScatterFunction::Add }), + make("ZeroInit", {false}), + make("Inplace", {false, true}), + make("Padding", {true}))) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +// m+k, k-1-D m+n-D case +FIXTURE_DATA_TEST_CASE(RunSmallBatchedMultiIndices, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallScatterBatchedDataset(), + make("DataType", {DataType::F32}), + make("ScatterFunction", {ScatterFunction::Update, ScatterFunction::Add}), + make("ZeroInit", {false}), + make("Inplace", {false}), + make("Padding", {true}))) { - // TODO: Add validate() here + validate(CLAccessor(_target), _reference, tolerance_f32); } + +// m+k, k-1-D m+n-D case +FIXTURE_DATA_TEST_CASE(RunSmallScatterScalar, CLScatterLayerFixture<float>, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallScatterScalarDataset(), + make("DataType", {DataType::F32}), + make("ScatterFunction", {ScatterFunction::Update, ScatterFunction::Add}), + make("ZeroInit", {false}), + make("Inplace", {false}), + make("Padding", {false}))) // NOTE: Padding not supported in this datset +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} + TEST_SUITE_END() // FP32 + + +// NOTE: Padding is disabled for the SmallScatterMixedDataset due certain shapes not supporting padding. +// Padding is well tested in F32 Datatype test cases. 
+ +TEST_SUITE(FP16) +FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<half>, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallScatterMixedDataset(), + make("DataType", {DataType::F16}), + allScatterFunctions, + make("ZeroInit", {false}), + make("Inplace", {false}), + make("Padding", {false}))) +{ + validate(CLAccessor(_target), _reference, tolerance_f16); +} +TEST_SUITE_END() // FP16 TEST_SUITE_END() // Float + +TEST_SUITE(Integer) +TEST_SUITE(S32) +FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<int32_t>, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallScatterMixedDataset(), + make("DataType", {DataType::S32}), + allScatterFunctions, + make("ZeroInit", {false}), + make("Inplace", {false}), + make("Padding", {false}))) +{ + validate(CLAccessor(_target), _reference, tolerance_int); +} +TEST_SUITE_END() // S32 + +TEST_SUITE(S16) +FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<int16_t>, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallScatterMixedDataset(), + make("DataType", {DataType::S16}), + allScatterFunctions, + make("ZeroInit", {false}), + make("Inplace", {false}), + make("Padding", {false}))) +{ + validate(CLAccessor(_target), _reference, tolerance_int); +} +TEST_SUITE_END() // S16 + +TEST_SUITE(S8) +FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<int8_t>, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallScatterMixedDataset(), + make("DataType", {DataType::S8}), + allScatterFunctions, + make("ZeroInit", {false}), + make("Inplace", {false}), + make("Padding", {false}))) +{ + validate(CLAccessor(_target), _reference, tolerance_int); +} +TEST_SUITE_END() // S8 + +TEST_SUITE(U32) +FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<uint32_t>, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallScatterMixedDataset(), + make("DataType", {DataType::U32}), + allScatterFunctions, + make("ZeroInit", {false}), + make("Inplace", {false}), + make("Padding", {false}))) +{ + validate(CLAccessor(_target), _reference, tolerance_int); +} +TEST_SUITE_END() // U32 + +TEST_SUITE(U16) +FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<uint16_t>, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallScatterMixedDataset(), + make("DataType", {DataType::U16}), + allScatterFunctions, + make("ZeroInit", {false}), + make("Inplace", {false}), + make("Padding", {false}))) +{ + validate(CLAccessor(_target), _reference, tolerance_int); +} +TEST_SUITE_END() // U16 + +TEST_SUITE(U8) +FIXTURE_DATA_TEST_CASE(RunSmallMixed, CLScatterLayerFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallScatterMixedDataset(), + make("DataType", {DataType::U8}), + allScatterFunctions, + make("ZeroInit", {false}), + make("Inplace", {false}), + make("Padding", {false}))) +{ + validate(CLAccessor(_target), _reference, tolerance_int); +} +TEST_SUITE_END() // U8 +TEST_SUITE_END() // Integer + TEST_SUITE_END() // Scatter TEST_SUITE_END() // CL } // namespace validation diff --git a/tests/validation/CPP/DFT.cpp b/tests/validation/CPP/DFT.cpp index e19e850589..84431399be 100644 --- a/tests/validation/CPP/DFT.cpp +++ b/tests/validation/CPP/DFT.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2020, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -125,7 +125,7 @@ DATA_TEST_CASE(Real, framework::DatasetMode::ALL, shapes_2d_dft, auto backward = reference::ridft_2d(forward, is_odd); // Validate with input - validate(SimpleTensorAccessor<float>(src), backward, RelativeTolerance<float>(0.1f)); + validate(SimpleTensorAccessor<float>(src), backward, RelativeTolerance<float>(0.1f), 0.f, AbsoluteTolerance<float>(0.001f)); } DATA_TEST_CASE(Complex, framework::DatasetMode::ALL, shapes_2d_dft, diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp index 7a9230d37a..d739d4e1a4 100644 --- a/tests/validation/NEON/ConvolutionLayer.cpp +++ b/tests/validation/NEON/ConvolutionLayer.cpp @@ -767,21 +767,33 @@ FIXTURE_DATA_TEST_CASE(UC2_2_NEGEMMConvolutionLayer, HasOptImplFixtureNoFastMath } #if defined(ARM_COMPUTE_ENABLE_BF16) - +// These tests currently only works with SVE length 256 +// If other SVE length is used a kernel will fail to be found +// This needs to be addressed in order to ensure it doesn't revert to FP32 kernels for systems with SVE length other than 256 FIXTURE_DATA_TEST_CASE(UC2_2_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<cpu::CpuGemmConv2d>, framework::DatasetMode::ALL, combine(framework::dataset::make("DataType", { DataType::F32 }), framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo8i4_bf16 }))) { - ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT_EQUAL(_computed_weight_format, arm_compute::WeightFormat::OHWIo8i4_bf16, framework::LogLevel::ERRORS); + if(Scheduler::get().cpu_info().has_bf16() && (arm_gemm::utils::get_vector_length<float>() == 8)){ + ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT_EQUAL(_computed_weight_format, arm_compute::WeightFormat::OHWIo8i4_bf16, framework::LogLevel::ERRORS); + } + else{ + ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS); + } } FIXTURE_DATA_TEST_CASE(UC2_2_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL, combine(framework::dataset::make("DataType", { DataType::F32 }), framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::OHWIo8i4_bf16 }))) { - ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(_computed_weight_format == arm_compute::WeightFormat::OHWIo8i4_bf16, framework::LogLevel::ERRORS); + if(Scheduler::get().cpu_info().has_bf16() && (arm_gemm::utils::get_vector_length<float>() == 8)){ + ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(_computed_weight_format == arm_compute::WeightFormat::OHWIo8i4_bf16, framework::LogLevel::ERRORS); + } + else{ + ARM_COMPUTE_EXPECT(!_kernel_found, framework::LogLevel::ERRORS); + } } #endif // ARM_COMPUTE_ENABLE_BF16 @@ -852,20 +864,36 @@ FIXTURE_DATA_TEST_CASE(UC3_2_CpuGemmConv2d_FastMath, HasOptImplFixtureFastMath<c combine(framework::dataset::make("DataType", { DataType::F32 }), framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY }))) { - ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS); + 
if(Scheduler::get().cpu_info().has_bf16()){ + ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS); + } + else{ + ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS); + } } FIXTURE_DATA_TEST_CASE(UC3_2_NEGEMMConvolutionLayer_FastMath, HasOptImplFixtureFastMath<NEGEMMConvolutionLayer>, framework::DatasetMode::ALL, combine(framework::dataset::make("DataType", { DataType::F32 }), framework::dataset::make("QueryWeightFormat", { arm_compute::WeightFormat::ANY }))) { - ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS); + if(Scheduler::get().cpu_info().has_bf16()){ + ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS); + } + else{ + ARM_COMPUTE_EXPECT(_kernel_found, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::ANY, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(_computed_weight_format != arm_compute::WeightFormat::UNSPECIFIED, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!arm_compute::is_fixed_format_fast_math(_computed_weight_format), framework::LogLevel::ERRORS); + } } #endif // ARM_COMPUTE_ENABLE_BF16 @@ -1141,7 +1169,7 @@ TEST_SUITE(Float) TEST_SUITE(BFLOAT16) FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), framework::dataset::make("ReshapeWeights", { true })), - framework::dataset::make("DataType", DataType::BFLOAT16)), + framework::dataset::make("DataType", Scheduler::get().cpu_info().has_bf16() ? 
DataType::BFLOAT16 : DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NHWC })), ActivationFunctionsDataset)) { @@ -1329,6 +1357,27 @@ FIXTURE_DATA_TEST_CASE(RunSmallSigned, NEGEMMConvolutionLayerQuantizedPerChannel // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } + +FIXTURE_DATA_TEST_CASE(MemoryStressLargeChannels, NEGEMMConvolutionLayerQuantizedPerChannelFixture<int8_t>, + framework::DatasetMode::ALL, + combine( + make("In", TensorShape(1U)), + make("Weights", TensorShape(1U, 1U, 1U, 17000U)), + make("Biases", TensorShape(17000U)), + make("Out", TensorShape(1U, 1U, 17000U)), + make("Info", PadStrideInfo(1, 1, 0, 0)), + make("Dilation", Size2D(1, 1)), + make("ReshapeWeights", { true }), + make("DataType", { DataType::QASYMM8_SIGNED }), + make("DataLayout", { DataLayout::NHWC }), + make("QuantizationInfo", QuantizationInfo(0.5f, 10)), + make("ActivationInfo", ActivationLayerInfo()), + make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} + TEST_SUITE_END() // QSYMM8_PER_CHANNEL TEST_SUITE_END() // Quantized diff --git a/tests/validation/NEON/GEMM.cpp b/tests/validation/NEON/GEMM.cpp index f956cdfeda..5f6a402204 100644 --- a/tests/validation/NEON/GEMM.cpp +++ b/tests/validation/NEON/GEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -51,6 +51,8 @@ namespace test { namespace validation { +using framework::dataset::make; + namespace { constexpr AbsoluteTolerance<float> tolerance_f(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for FP32 data types */ @@ -60,7 +62,7 @@ const AbsoluteTolerance<float> abs_tolerance_f16(0.2f); /**< Absolute constexpr float tolerance_num = 0.07f; /**< Tolerance number for FP16 data types */ #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ /** CNN data types */ -const auto CNNDataTypes = framework::dataset::make("DataType", +const auto CNNDataTypes = make("DataType", { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC DataType::F16, @@ -68,8 +70,8 @@ const auto CNNDataTypes = framework::dataset::make("DataType", DataType::F32, }); -const auto data_interleave = framework::dataset::make("M", 8, 12) * framework::dataset::make("N", 8, 12); -const auto data_transpose = framework::dataset::make("M", 8, 14) * framework::dataset::make("N", 7, 14); +const auto data_interleave = make("M", 8, 12) * make("N", 8, 12); +const auto data_transpose = make("M", 8, 14) * make("N", 7, 14); /** Zero padding test */ template <typename FunctionType> @@ -204,16 +206,16 @@ TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL) // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( - framework::dataset::make("LhsInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::S32), // Unsupported data type + make("LhsInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::S32), // Unsupported data type TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), }), - framework::dataset::make("RhsInfo",{ TensorInfo(TensorShape(8U, 27U), 1, DataType::S32), + make("RhsInfo",{ TensorInfo(TensorShape(8U, 27U), 1, DataType::S32), TensorInfo(TensorShape(8U, 27U), 1, DataType::F32), })), - framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(8U, 13U), 1, DataType::S32), + make("OutputInfo",{ TensorInfo(TensorShape(8U, 13U), 1, DataType::S32), 
TensorInfo(TensorShape(8U, 13U), 1, DataType::F32), })), - framework::dataset::make("Expected", { false, true })), + make("Expected", { false, true })), lhs_info, rhs_info, output_info, expected) { constexpr float alpha = 1.0; @@ -226,8 +228,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( // *INDENT-ON* TEST_SUITE(KERNEL_SELECTION) DATA_TEST_CASE(KernelSelection_mul_and_add, framework::DatasetMode::ALL, - combine(framework::dataset::make("CpuExt", std::string("NEON")), - framework::dataset::make("DataType", { DataType::F32, + combine(make("CpuExt", std::string("NEON")), + make("DataType", { DataType::F32, DataType::F16 })), cpu_ext, data_type) @@ -261,8 +263,8 @@ TEST_SUITE_END() // KERNEL_SELECTION TEST_SUITE(TRANSPOSE_1XW) using CpuGemmTranspose1xW = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuGemmTranspose1xWKernel>; DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip( - framework::dataset::make("N", { 1, 23, 63, 101 }), - framework::dataset::make("K", { 1, 47, 29, 27 })), + make("N", { 1, 23, 63, 101 }), + make("K", { 1, 47, 29, 27 })), n_value, k_value) { bool status = validate_zero_padding<CpuGemmTranspose1xW>(n_value, k_value); @@ -271,7 +273,7 @@ DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip( TEST_SUITE(U32) using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint32_t>; -FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U32)) +FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * make("DataType", DataType::U32)) { // Validate output validate(Accessor(_target), _reference); @@ -280,7 +282,7 @@ TEST_SUITE_END() // U32 TEST_SUITE(U16) using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint16_t>; -FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U16)) +FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * make("DataType", DataType::U16)) { // Validate output validate(Accessor(_target), _reference); @@ -289,7 +291,7 @@ TEST_SUITE_END() // U16 TEST_SUITE(U8) using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint8_t>; -FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U8)) +FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * make("DataType", DataType::U8)) { // Validate output validate(Accessor(_target), _reference); @@ -302,8 +304,8 @@ TEST_SUITE(INTERLEAVE_4X4) using CpuGemmInterleave4x4 = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuGemmInterleave4x4Kernel>; DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip( - framework::dataset::make("M", { 1, 23, 63, 101 }), - framework::dataset::make("K", { 1, 47, 29, 27 })), + make("M", { 1, 23, 63, 101 }), + make("K", { 1, 47, 29, 27 })), m_value, k_value) { bool status = validate_zero_padding<cpu::kernels::CpuGemmInterleave4x4Kernel>(m_value, k_value); @@ -312,7 +314,7 @@ DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip( 
TEST_SUITE(U32) using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint32_t>; -FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U32)) +FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * make("DataType", DataType::U32)) { // Validate output validate(Accessor(_target), _reference); @@ -321,7 +323,7 @@ TEST_SUITE_END() // U32 TEST_SUITE(U16) using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint16_t>; -FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U16)) +FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * make("DataType", DataType::U16)) { // Validate output validate(Accessor(_target), _reference); @@ -330,7 +332,7 @@ TEST_SUITE_END() // U16 TEST_SUITE(U8) using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint8_t>; -FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::QASYMM8)) +FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * make("DataType", DataType::QASYMM8)) { // Validate output validate(Accessor(_target), _reference); @@ -345,15 +347,18 @@ using NEGEMMFixture = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T>; template <typename T> using NEBatchedMatMulFixture = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T, true, false, false, false, false, true>; +template <typename T> +using NEGEMMAccumulateFixture = GEMMAccumulateValidationFixture<Tensor, Accessor, NEGEMM, T>; + TEST_SUITE(Float) -DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(framework::dataset::make("In0", { TensorShape(21U, 13U), +DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(make("In0", { TensorShape(21U, 13U), TensorShape(31U, 1U), TensorShape(31U, 1U), TensorShape(8U, 2U), TensorShape(38U, 12U), TensorShape(32U, 1U) }), - framework::dataset::make("In1", { TensorShape(33U, 21U), + make("In1", { TensorShape(33U, 21U), TensorShape(23U, 31U), TensorShape(23U, 31U), TensorShape(16U, 8U), @@ -366,75 +371,111 @@ DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(framework:: ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS); } +DATA_TEST_CASE(ValidateAccumulate, framework::DatasetMode::ALL, combine( + zip(make("In0",{ TensorShape(21U, 13U) }), + make("In1", { TensorShape(33U, 21U) }), + make("Dst", { TensorShape(33U, 13U) })), + zip( + make("alpha", { 1.0, 100.0, 1.0, 1.0 }), + make("beta", { 0.0, 0.0, 1.0, 1.0 }), + make("is_c_null", { false, false, false, true }), + make("Expected", { true, false, false, true }))), + shape_a, shape_b, shape_dst, alpha, beta, is_c_null, expected) +{ + /* Accumulation test for GEMM kernels */ + // Create tensors + TensorInfo in_a(shape_a, 1, DataType::F32); + TensorInfo in_b(shape_b, 1, DataType::F32); + TensorInfo in_c(shape_dst, 1, DataType::F32); + TensorInfo dst(shape_dst, 1, DataType::F32); + + GEMMInfo gemm_info = GEMMInfo(); + gemm_info.set_accumulate(true); + + // Validate accumulation + cpu::CpuGemm gemm; + Status 
status = gemm.validate(&in_a, &in_b, (is_c_null ? nullptr : &in_c), &dst, alpha, beta, gemm_info); + ARM_COMPUTE_EXPECT((expected == bool(status)), framework::LogLevel::ERRORS); +} + #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(), - framework::dataset::make("ReshapeWeights", { true, false })), - framework::dataset::make("DataType", DataType::F16))) + make("ReshapeWeights", { true, false })), + make("DataType", DataType::F16))) { // Validate output validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16); } - -TEST_SUITE(BATCHED_MATMUL) - -FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchedMatMulFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallBatchedMatMulDataset(), - framework::dataset::make("ReshapeWeights", { false })), - framework::dataset::make("DataType", DataType::F16))) +FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeGEMMDataset(), + make("ReshapeWeights", { true, false })), + make("DataType", DataType::F16))) { // Validate output validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16); } -TEST_SUITE_END() -FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeGEMMDataset(), - framework::dataset::make("ReshapeWeights", { true, false })), - - framework::dataset::make("DataType", DataType::F16))) +TEST_SUITE(BATCHED_MATMUL) +FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchedMatMulFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallBatchedMatMulDataset(), + make("ReshapeWeights", { false })), + make("DataType", DataType::F16))) { // Validate output validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16); } -TEST_SUITE_END() +TEST_SUITE_END() // BATCHED_MATMUL + +TEST_SUITE_END() // FP16 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(), - framework::dataset::make("ReshapeWeights", { true, false })), - - framework::dataset::make("DataType", DataType::F32))) + make("ReshapeWeights", { true, false })), + make("DataType", DataType::F32))) { // Validate output validate(Accessor(_target), _reference, tolerance_f); } FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeGEMMDataset(), - framework::dataset::make("ReshapeWeights", { true, false })), - - framework::dataset::make("DataType", DataType::F32))) + make("ReshapeWeights", { true, false })), + make("DataType", DataType::F32))) { // Validate output validate(Accessor(_target), _reference, tolerance_f); } TEST_SUITE(BATCHED_MATMUL) - -TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchedMatMulFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallBatchedMatMulDataset(), - framework::dataset::make("ReshapeWeights", { false })), - framework::dataset::make("DataType", DataType::F32))) + make("ReshapeWeights", { false })), + make("DataType", DataType::F32))) { // Validate output validate(Accessor(_target), _reference, tolerance_f); } -TEST_SUITE_END() +TEST_SUITE_END() // BATCHED_MATMUL -TEST_SUITE_END() +TEST_SUITE(ACCUMULATE) +FIXTURE_DATA_TEST_CASE(RunSmall, 
NEGEMMAccumulateFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallAccumulateGEMMDataset(), + make("ReshapeWeights", { false }), + make("DataType", DataType::F32))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_f); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMAccumulateFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeAccumulateGEMMDataset(), + make("ReshapeWeights", { false }), + make("DataType", DataType::F32))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_f); +} +TEST_SUITE_END() // ACCUMULATE -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // FP32 -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // Float +TEST_SUITE_END() // GEMM +TEST_SUITE_END() // NEON } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp index 9c4d1741eb..d25f43a330 100644 --- a/tests/validation/NEON/GEMMLowp.cpp +++ b/tests/validation/NEON/GEMMLowp.cpp @@ -47,12 +47,24 @@ namespace test { namespace validation { +using framework::dataset::make; + +namespace +{ + constexpr AbsoluteTolerance<float> tolerance_batched(1); + constexpr AbsoluteTolerance<float> tolerance_quant(1); +} // namespace + + TEST_SUITE(NEON) TEST_SUITE(GEMMLowp) TEST_SUITE(MatrixMultiplyCore) using NEGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>; +using NEGEMMLowpMatrixMultiplyCoreAccumulateFixture = GEMMLowpMatrixMultiplyAccumulateValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>; using NEGEMMLowpBatchedMatMulFixture = GEMMLowpMatrixMultiplyCoreValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, true>; +using NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture = GEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>; +using NEGEMMLowpDequantizedMatrixMultiplyValidationFixture = GEMMLowpDequantizedMatrixMultiplyValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>; using framework::dataset::make; @@ -80,6 +92,46 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::c validate(b.info()->padding(), PaddingSize()); validate(c.info()->padding(), PaddingSize()); } +// accumulation is not supported for Int8/UInt8 in aarch32 +#ifdef __aarch64__ +DATA_TEST_CASE(ValidateAccumulate, framework::DatasetMode::ALL, combine( + zip( + make("In0",{ TensorShape(21U, 1U) }), + make("In1", { TensorShape(1U, 21U) }), + make("Dst", { TensorShape(1U, 1U) }), + make("a_offset", { -2 }), + make("a_offset", { 13 }) + ), + zip( + make("OutputDataType", { DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED}), + make("Expected", { true, false, false }) + )), + shape_a, shape_b, shape_dst, a_offset, b_offset, output_data_type, expected) +{ + DataType input_data_type = (output_data_type == DataType::S32 ? 
DataType::QASYMM8 : output_data_type); + // Accumulation test for GEMM kernels + TensorInfo a(shape_a, 1, input_data_type, QuantizationInfo(1.0f / 255, a_offset)); + TensorInfo b(shape_b, 1, input_data_type, QuantizationInfo(1.0f / 255, b_offset)); + TensorInfo dst(shape_dst, 1, output_data_type, QuantizationInfo()); + + // Create and configure function + GEMMInfo gemm_info = GEMMInfo(); + gemm_info.set_accumulate(true); + + if (is_data_type_quantized(output_data_type)) + { + GEMMLowpOutputStageInfo gemmLowpOutputStageInfo = GEMMLowpOutputStageInfo(); + gemmLowpOutputStageInfo.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + + gemm_info.set_gemmlowp_output_stage(gemmLowpOutputStageInfo); + } + + cpu::CpuGemmLowpMatrixMultiplyCore gemmlowp_mm; + Status status = gemmlowp_mm.validate(&a, &b, nullptr, &dst, gemm_info); + + ARM_COMPUTE_EXPECT((expected == bool(status)), framework::LogLevel::ERRORS); +} +#endif // __arch64__ // *INDENT-OFF* // clang-format off @@ -226,13 +278,10 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFixture, framework: validate(Accessor(_target), _reference); } -constexpr AbsoluteTolerance<float> tolerance_batched(1); - -using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned = - GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, uint8_t, uint8_t, true>; - TEST_SUITE(BatchedMatMul) TEST_SUITE(QASYMM8) +using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned = + GEMMLowpBatchedMatrixMultiplyCoreFusedOffsetOutputFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, uint8_t, uint8_t, true>; FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(), make("DataType", { DataType::QASYMM8 }), @@ -242,9 +291,9 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFi } TEST_SUITE_END() // QASYMM8 -using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned = - GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, int8_t, int8_t, true>; TEST_SUITE(QASYMM8_SIGNED) +using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned = + GEMMLowpBatchedMatrixMultiplyCoreFusedOffsetOutputFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, int8_t, int8_t, true>; FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(), make("DataType", { DataType::QASYMM8_SIGNED }), @@ -255,26 +304,84 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFi TEST_SUITE_END() // QASYMM8_SIGNED TEST_SUITE_END() // BatchedMatMul -using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>; -constexpr AbsoluteTolerance<float> tolerance_quant(1); - TEST_SUITE(FusedOffsetOutput) +using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>; FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(), - 
make("DataType", { DataType::QASYMM8 }))) + make("DataType", { DataType::QASYMM8 }), + make("reshape_b_only_on_first_run", { false }))) { // Validate output validate(Accessor(_target), _reference, tolerance_quant); } - FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(), - make("DataType", { DataType::QASYMM8 }))) + make("DataType", { DataType::QASYMM8 }), + make("reshape_b_only_on_first_run", { false }))) { // Validate output validate(Accessor(_target), _reference, tolerance_quant); } TEST_SUITE_END() // FusedOffsetOutput + +// accumulation is not supported for Int8/UInt8 in aarch32 +#ifdef __aarch64__ +TEST_SUITE(ACCUMULATION) +TEST_SUITE(S32) +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreAccumulateFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset()) +{ + // Validate output + validate(Accessor(_target), _reference); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreAccumulateFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset()) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // S32 +TEST_SUITE_END() // ACCUMULATION +#endif // __arch64__ + +TEST_SUITE(DynamicQuantization) +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset()) +{ + // Validate output + validate(Accessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset()) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // DynamicQuantization + +#ifdef __aarch64__ +// Deqaunt tests involve returning F32 from the MatrixMultiplyCore kernels and is only implemented in aarch64 +TEST_SUITE(Dequant) +constexpr AbsoluteTolerance<float> tolerance_dequantized(0.01f); +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::ALL, + combine( + datasets::SmallGEMMLowpDataset(), + make("accumulate", {true, false}) + )) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_dequantized); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::NIGHTLY, + combine( + datasets::LargeGEMMLowpDataset(), + make("accumulate", {false}) + )) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_dequantized); +} +TEST_SUITE_END() // Dequant +#endif // __aarch64__ + TEST_SUITE_END() // MatrixMultiplyCore TEST_SUITE_END() // GEMMLowp TEST_SUITE_END() // NEON diff --git a/tests/validation/NEON/ReorderLayer.cpp b/tests/validation/NEON/ReorderLayer.cpp index 42fa0f8b00..839ad0ac92 100644 --- a/tests/validation/NEON/ReorderLayer.cpp +++ b/tests/validation/NEON/ReorderLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Arm Limited. + * Copyright (c) 2023-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -33,6 +33,7 @@ #include "tests/validation/Validation.h" #include "tests/validation/fixtures/ReorderFixture.h" #include "src/core/NEON/kernels/NEReorderKernel.h" +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" namespace arm_compute { @@ -40,6 +41,8 @@ namespace test { namespace validation { +using framework::dataset::make; + TEST_SUITE(NEON) TEST_SUITE(ReorderLayer) @@ -48,13 +51,46 @@ using NEReorderLayerAlias = ReorderValidationFixture<Tensor, Accessor, NEReorder TEST_SUITE(FP32) #if defined(ARM_COMPUTE_ENABLE_SVE) -FIXTURE_DATA_TEST_CASE(RunBlock8, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock8(), framework::dataset::make("DataType", DataType::F32))) +DATA_TEST_CASE(ValidateReorderOHWIo8, framework::DatasetMode::ALL, combine( + zip( + make("InShape",{ TensorShape(10U, 9U), TensorShape(234U, 301U) }), + make("OutShape", { TensorShape(10U, 16U), TensorShape(234U, 304U) }) + ), + zip( + make("InputWeightFormat", {WeightFormat::OHWI}), + make("OutputWeightFormat", {WeightFormat::OHWIo8}) + )), + input_shape, output_shape, input_wf, output_wf) +{ + if(Scheduler::get().cpu_info().has_sve()){ + arm_compute::NEReorderLayer reorder_layer; + int vector_length = arm_gemm::utils::get_vector_length<float>(); + bool expected_bool_status = false; + if (vector_length == 8) + { + expected_bool_status = true; + } + + TensorInfo input_tensor_info(input_shape, 1, DataType::F32); + TensorInfo output_tensor_info(output_shape, 1, DataType::F32); + + Status status = reorder_layer.validate(&input_tensor_info, &output_tensor_info, input_wf, output_wf); + + ARM_COMPUTE_EXPECT((expected_bool_status == bool(status)), framework::LogLevel::ERRORS); + } +} + +FIXTURE_DATA_TEST_CASE(RunBlock8, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock8(), make("DataType", DataType::F32))) { // Validate output - validate(Accessor(_target), _reference); + if (_hardware_supports) + { + validate(Accessor(_target), _reference); + } } #endif // ARM_COMPUTE_ENABLE_SVE -FIXTURE_DATA_TEST_CASE(RunBlock4, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock4(), framework::dataset::make("DataType", DataType::F32))) + +FIXTURE_DATA_TEST_CASE(RunBlock4, NEReorderLayerAlias<float>, framework::DatasetMode::ALL, combine(datasets::ReorderLayerDatasetBlock4(), make("DataType", DataType::F32))) { // Validate output validate(Accessor(_target), _reference); @@ -68,4 +104,4 @@ TEST_SUITE_END() // NEON } // namespace test } // namespace arm_compute -#endif // defined(__aarch64__)
\ No newline at end of file +#endif // defined(__aarch64__) diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp index 2397d81547..94d0866c38 100644 --- a/tests/validation/NEON/SoftmaxLayer.cpp +++ b/tests/validation/NEON/SoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020, 2022-2023 Arm Limited. + * Copyright (c) 2017-2020, 2022-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -122,40 +122,35 @@ template <typename T> using NESoftmaxLayerFixture = SoftmaxValidationFixture<Tensor, Accessor, NESoftmaxLayer, T>; DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, - concat(concat( + concat( combine( - make("CpuExt", std::string("NEON")), + make("CpuExt", std::string("neon")), make("DataType", { DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED}) ), combine( - make("CpuExt", std::string("SVE")), + make("CpuExt", std::string("sme2")), make("DataType", { DataType::F32, DataType::F16})) ), - combine( - make("CpuExt", std::string("SVE2")), - make("DataType", { DataType::QASYMM8, - DataType::QASYMM8_SIGNED})) - ), cpu_ext, data_type) { using namespace cpu::kernels; cpuinfo::CpuIsaInfo cpu_isa{}; - cpu_isa.neon = (cpu_ext == "NEON"); - cpu_isa.sve = (cpu_ext == "SVE"); - cpu_isa.sve2 = (cpu_ext == "SVE2"); + cpu_isa.neon = (cpu_ext == "neon"); + cpu_isa.sme2 = (cpu_ext == "sme2"); cpu_isa.fp16 = (data_type == DataType::F16); const auto *selected_impl = CpuSoftmaxKernel::get_implementation( - SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */ }, cpu::KernelSelectionType::Preferred); + SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */, 0 /* axis */, CPUInfo::get().get_sme2_vector_length()}, + cpu::KernelSelectionType::Preferred); ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); - std::string expected = "neon_" + cpu_impl_dt(data_type) + "_softmax"; + std::string expected = cpu_ext + "_" + cpu_impl_dt(data_type) + "_softmax"; std::string actual = selected_impl->name; ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS); @@ -164,9 +159,19 @@ DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, TEST_SUITE(Float) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, -1 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_f16); +} FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine( - datasets::Small4DShapes(), + datasets::SmallShapes(), make("DataType", DataType::F16), make("Beta", { 1.0f, 2.0f }), make("Axis", { 0, 1 }))) @@ -178,7 +183,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<half>, framework::Datas combine( datasets::Small4DShapes(), make("DataType", DataType::F16), - make("Beta", { 1.0f, 2.0f }), + make("Beta", { 1.0f }), make("Axis", { 0, 2, -1 }))) { // Validate output diff --git a/tests/validation/NEON/UNIT/RuntimeContext.cpp b/tests/validation/NEON/UNIT/RuntimeContext.cpp index 819811943d..e0d45c639a 100644 --- a/tests/validation/NEON/UNIT/RuntimeContext.cpp +++ b/tests/validation/NEON/UNIT/RuntimeContext.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -48,6 +48,19 @@ namespace validation { TEST_SUITE(NEON) TEST_SUITE(UNIT) +#if defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__) +TEST_CASE(CpuCapacity, framework::DatasetMode::ALL) +{ + CPUInfo& ci = arm_compute::Scheduler::get().cpu_info(); + const uint32_t nonlittle_num_cpus = ci.get_cpu_num_excluding_little(); + const uint32_t num_threads = arm_compute::Scheduler::get().num_threads(); + + ARM_COMPUTE_EXPECT(num_threads<=nonlittle_num_cpus , framework::LogLevel::ERRORS); +} +#endif /* defined(ARM_COMPUTE_OPENMP_SCHEDULER) && !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ + TEST_SUITE(RuntimeContext) TEST_CASE(Scheduler, framework::DatasetMode::ALL) diff --git a/tests/validation/UNIT/CPPScheduler.cpp b/tests/validation/UNIT/CPPScheduler.cpp index 52431653b5..6a3f6819fc 100644 --- a/tests/validation/UNIT/CPPScheduler.cpp +++ b/tests/validation/UNIT/CPPScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Arm Limited. + * Copyright (c) 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -68,8 +68,7 @@ public: TEST_SUITE(UNIT) TEST_SUITE(CPPScheduler) - -#if !defined(BARE_METAL) +#if defined(ARM_COMPUTE_CPP_SCHEDULER) && !defined(BARE_METAL) TEST_CASE(RethrowException, framework::DatasetMode::ALL) { CPPScheduler scheduler; @@ -87,7 +86,6 @@ TEST_CASE(RethrowException, framework::DatasetMode::ALL) } ARM_COMPUTE_EXPECT_FAIL("Expected exception not caught", framework::LogLevel::ERRORS); } -#endif // !defined(BARE_METAL) - +#endif // defined(ARM_COMPUTE_CPP_SCHEDULER) && !defined(BARE_METAL) TEST_SUITE_END() TEST_SUITE_END() diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h index afde3d8067..94bedc83e1 100644 --- a/tests/validation/fixtures/GEMMFixture.h +++ b/tests/validation/fixtures/GEMMFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -46,14 +46,14 @@ namespace test namespace validation { template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool disable_c = false, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool pretranspose_a = false, bool pretranspose_b = false, bool run_twice = false> -class GEMMValidationFixture : public framework::Fixture +class GEMMGenericValidationFixture : public framework::Fixture { public: - void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape output_shape, float alpha, float beta, bool pretranspose, DataType data_type) + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape output_shape, float alpha, float beta, bool pretranspose, DataType data_type, bool accumulate=false) { ARM_COMPUTE_UNUSED(pretranspose); - _target = compute_target(shape_a, shape_b, shape_c, output_shape, alpha, beta, data_type); - _reference = compute_reference(shape_a, shape_b, output_shape, alpha, beta, data_type); + _target = compute_target(shape_a, shape_b, shape_c, output_shape, alpha, beta, data_type, accumulate); + _reference = compute_reference(shape_a, shape_b, output_shape, alpha, beta, data_type, accumulate); } protected: @@ -80,7 +80,7 @@ protected: } TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c, const TensorShape &output_shape, float alpha, float beta, - DataType data_type) + DataType data_type, bool accumulate=false) { // Create tensors TensorType a = create_tensor<TensorType>(shape_a, data_type, 1); @@ -99,7 +99,7 @@ protected: &dst, alpha, beta, GEMMInfo(false, false, false, (reinterpret_output_as_3d ? output_shape[2] : 0), reinterpret_input_as_3d, false, GEMMLowpOutputStageInfo(), false, false, (reinterpret_input_as_3d - || reinterpret_output_as_3d))); + || reinterpret_output_as_3d), arm_compute::ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED, false /* pretranspose_B */, accumulate)); ARM_COMPUTE_ASSERT(a.info()->is_resizable()); ARM_COMPUTE_ASSERT(b.info()->is_resizable()); ARM_COMPUTE_ASSERT(c.info()->is_resizable()); @@ -121,11 +121,14 @@ protected: // Fill tensors fill(AccessorType(a), 0); fill(AccessorType(b), 1); + if (accumulate) + { + fill(AccessorType(dst), 6); + } if(!disable_c) { fill(AccessorType(c), 2); } - // Run with variable inputs. if(run_twice) { @@ -145,7 +148,7 @@ protected: } SimpleTensor<T> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &output_shape, float alpha, float beta, - DataType data_type) + DataType data_type, bool accumulate=false) { TensorShape shape_a_to_use = shape_a; if(reinterpret_input_as_3d) @@ -158,6 +161,7 @@ protected: SimpleTensor<T> a{ shape_a_to_use, data_type, 1 }; SimpleTensor<T> b{ shape_b, data_type, 1 }; SimpleTensor<T> c{ output_shape, data_type, 1 }; + SimpleTensor<T> dst{ output_shape, data_type, 1 }; // Fill reference fill(a, 0); @@ -211,17 +215,51 @@ protected: fill(c, 5); } + // Do in place summation + if (accumulate) + { + fill(dst, 6); + } + // Setting beta to 0 will effectively disable C for the // computation of the reference: alpha * A * B + 0 * C // Use transposed tensors if boolean enabled else use original tensors - auto r = reference::gemm<T>((pretranspose_a) ? a_transposed : a, (pretranspose_b) ? b_transposed : b, c, alpha, disable_c ? 
0.f : beta); - return r; + if (accumulate) + { + reference::gemm_accumulate<T>((pretranspose_a) ? a_transposed : a, (pretranspose_b) ? b_transposed : b, c, alpha, disable_c ? 0.f : beta, dst); + return dst; + } + else + { + return reference::gemm<T>((pretranspose_a) ? a_transposed : a, (pretranspose_b) ? b_transposed : b, c, alpha, disable_c ? 0.f : beta); + } } TensorType _target{}; SimpleTensor<T> _reference{}; }; +template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool disable_c = false, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool pretranspose_a = false, bool pretranspose_b = false, bool run_twice = false> +class GEMMValidationFixture : protected GEMMGenericValidationFixture<TensorType, AccessorType, FunctionType, T, disable_c, reinterpret_input_as_3d, reinterpret_output_as_3d, pretranspose_a, pretranspose_b, run_twice> +{ +public: + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape output_shape, float alpha, float beta, bool pretranspose, DataType data_type) + { + GEMMGenericValidationFixture<TensorType, AccessorType, FunctionType, T, disable_c, reinterpret_input_as_3d, reinterpret_output_as_3d, pretranspose_a, pretranspose_b, run_twice>::setup(shape_a, shape_b, shape_c, output_shape, alpha, beta, pretranspose, data_type, false /*accumulate*/); + } +}; + +template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool disable_c = false, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool pretranspose_a = false, bool pretranspose_b = false, bool run_twice = false> +class GEMMAccumulateValidationFixture : protected GEMMGenericValidationFixture<TensorType, AccessorType, FunctionType, T, disable_c, reinterpret_input_as_3d, reinterpret_output_as_3d, pretranspose_a, pretranspose_b, run_twice> +{ +public: + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape output_shape, float alpha, float beta, bool pretranspose, DataType data_type) + { + bool accumulate = true; + GEMMGenericValidationFixture<TensorType, AccessorType, FunctionType, T, disable_c, reinterpret_input_as_3d, reinterpret_output_as_3d, pretranspose_a, pretranspose_b, run_twice>::setup(shape_a, shape_b, shape_c, output_shape, alpha, beta, pretranspose, data_type, accumulate); + } +}; + template <typename TensorType, typename AccessorType, typename T, typename GEMMOperatorType> class GEMMMatrixMultiplyValidationFixture : public framework::Fixture { diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h index a65a1e6bd8..aa4eedb75d 100644 --- a/tests/validation/fixtures/GEMMLowpFixture.h +++ b/tests/validation/fixtures/GEMMLowpFixture.h @@ -30,6 +30,8 @@ #include "tests/framework/Fixture.h" #include "tests/validation/Validation.h" #include "tests/validation/reference/GEMMLowp.h" +#include "tests/validation/reference/ArithmeticOperations.h" +#include "tests/validation/reference/DequantizationLayer.h" #include <cstdint> #include <vector> @@ -42,20 +44,35 @@ namespace validation { namespace { - template <typename U> void fill(U &&tensor, int i) { + library->fill_tensor_uniform(tensor, i); +} + +template <typename U> +void fill_quantized(U &&tensor, int i) +{ ARM_COMPUTE_ASSERT(is_data_type_quantized(tensor.data_type())); library->fill_tensor_uniform(tensor, i); } template <typename U> -void fill_bias_s32(U &&tensor, int i, int32_t min, int32_t max) +void fill(U &&tensor, int i, int32_t 
min, int32_t max) { - ARM_COMPUTE_ASSERT(tensor.data_type() == DataType::S32); - std::uniform_int_distribution<int32_t> distribution(min, max); - library->fill(tensor, distribution, i); + if (tensor.data_type() == DataType::S32) { + std::uniform_int_distribution<int32_t> distribution(min, max); + library->fill(tensor, distribution, i); + } + else if(tensor.data_type() == DataType::F32) + { + std::uniform_real_distribution<float> distribution((float)min, (float)max); + library->fill(tensor, distribution, i); + } + else + { + ARM_COMPUTE_ERROR("NOT SUPPORTED!"); + } } /** Information about how to fill tensors */ @@ -64,6 +81,11 @@ struct TensorFillInfo // Bias fill range. Default values are arbitrary int32_t min_bias {-20000}; int32_t max_bias {20000}; + + // Output fill range. Default values are arbitrary + int32_t min_output {-20000}; + int32_t max_output {20000}; + // Optional extra hash to randomize tensor filling int32_t hash {0}; }; @@ -71,29 +93,42 @@ struct TensorFillInfo template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d, bool reinterpret_output_as_3d, typename OutputType, bool is_fused = false, bool run_twice = false> TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const QuantizationInfo& output_qinfo, DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8, - GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(), bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo() ) + GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(), bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo(), + bool accumulate = false, bool dynamic_qinfo = false, DataType data_type_output = DataType::UNKNOWN) { ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type_a)); ARM_COMPUTE_ASSERT(data_type_a == data_type_b); - // Create tensors - const DataType data_type_output = output_stage.type == GEMMLowpOutputStageType::NONE ? DataType::S32 : data_type_a; + // If unknown, set to sensible defaults + if (data_type_output == DataType::UNKNOWN) { + data_type_output = output_stage.type == GEMMLowpOutputStageType::NONE ? DataType::S32 : data_type_a; + } - TensorType a = create_tensor<TensorType>(shape_a, data_type_a, 1, a_qinfo); - TensorType b = create_tensor<TensorType>(shape_b, data_type_b, 1, b_qinfo); // gemm output before output stage mismatch if i pass data_layout_output here. to be investigated + // Create tensors + TensorType a = create_tensor<TensorType>(shape_a, data_type_a, 1, dynamic_qinfo ? QuantizationInfo(1.0,0,true) : a_qinfo); + TensorType b = create_tensor<TensorType>(shape_b, data_type_b, 1, dynamic_qinfo ? QuantizationInfo(1.0,0,true) : b_qinfo); // gemm output before output stage mismatch if i pass data_layout_output here. to be investigated TensorType output = create_tensor<TensorType>(shape_output, data_type_output, 1, output_qinfo /* output_qinfo will be ignored when output stage type is None */); TensorType bias; if(is_fused) { TensorShape bias_shape(shape_b[0]); - bias = create_tensor<TensorType>(bias_shape, DataType::S32, 1); + bias = create_tensor<TensorType>(bias_shape,data_type_output == DataType::F32 ? 
DataType::F32 : DataType::S32, 1); } // Create and configure function // The GEMMinfo includes the values of the depth in case of reinterpreted 3d input/output FunctionType gemmlowp; gemmlowp.configure(&a, &b, is_fused ? &bias : nullptr, &output, GEMMInfo(false, false, reshape_b_only_on_first_run, (reinterpret_output_as_3d ? shape_output[2] : 0), reinterpret_input_as_3d, false, - output_stage)); + output_stage, false /*fp_mixed_precision*/, false /*fast_math*/, false /*broadcast_bias*/, + arm_compute::ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED, + false /* pretranspose_B */, accumulate)); + + // If the QuantizationInfo is dynamic, it needs to be settable after configure (note that we also force it to be dynamic) + if (dynamic_qinfo) + { + a.info()->set_quantization_info(QuantizationInfo(a_qinfo.scale(), a_qinfo.offset(), true)); + b.info()->set_quantization_info(QuantizationInfo(b_qinfo.scale(), b_qinfo.offset(), true)); + } ARM_COMPUTE_ASSERT(a.info()->is_resizable()); ARM_COMPUTE_ASSERT(b.info()->is_resizable()); @@ -111,26 +146,32 @@ TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape ARM_COMPUTE_ASSERT(!output.info()->is_resizable()); // Fill tensors - fill(AccessorType(a), 0 + finfo.hash); - fill(AccessorType(b), 1 + finfo.hash); + fill_quantized(AccessorType(a), 0 + finfo.hash); + fill_quantized(AccessorType(b), 1 + finfo.hash); + + if (accumulate) + { + ARM_COMPUTE_ASSERT(accumulate != run_twice); + fill(AccessorType(output), 6 + finfo.hash, finfo.min_output, finfo.max_output); + } if(is_fused) { ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); bias.allocator()->allocate(); ARM_COMPUTE_ASSERT(!bias.info()->is_resizable()); - fill_bias_s32(AccessorType(bias), 2 + finfo.hash, finfo.min_bias, finfo.max_bias); + fill(AccessorType(bias), 2 + finfo.hash, finfo.min_bias, finfo.max_bias); } // Run with variable inputs. if(run_twice) { gemmlowp.run(); - fill(AccessorType(a), 3 + finfo.hash); // Fill tensors with new seed after run - fill(AccessorType(b), 4 + finfo.hash); + fill_quantized(AccessorType(a), 3 + finfo.hash); // Fill tensors with new seed after run + fill_quantized(AccessorType(b), 4 + finfo.hash); if(is_fused) { - fill_bias_s32(AccessorType(bias), 5 + finfo.hash, finfo.min_bias, finfo.max_bias); + fill(AccessorType(bias), 5 + finfo.hash, finfo.min_bias, finfo.max_bias); } } @@ -168,8 +209,8 @@ SimpleTensor<int32_t> compute_gemmlowp_reference(const TensorShape &shape_a, con SimpleTensor<TW> b_transposed{ shape_b_transposed, data_type_b, 1, b_qinfo }; // Fill reference - fill(a, 0 + finfo.hash); - fill(b, 1 + finfo.hash); + fill_quantized(a, 0 + finfo.hash); + fill_quantized(b, 1 + finfo.hash); // Transpose reference if required /* Note: Assuming the usual batch matmul dimensions A = (B x M x K), B = (B x K x N), if pretranspose_A is set to true, then A is assumed to be (B x K x M), @@ -189,11 +230,12 @@ SimpleTensor<int32_t> compute_gemmlowp_reference(const TensorShape &shape_a, con // Run with variable inputs. const int32_t a_offset = a_qinfo.uniform().offset; const int32_t b_offset = b_qinfo.uniform().offset; + if(run_twice) { reference::gemmlowp_matrix_multiply_core<int32_t, TI, TW>((pretranspose_A ? a_transposed : a), (pretranspose_B ? b_transposed : b), shape_output, a_offset, b_offset); - fill((pretranspose_A) ? a_transposed : a, 3 + finfo.hash); - fill((pretranspose_B) ? b_transposed : b, 4 + finfo.hash); + fill_quantized((pretranspose_A) ? 
a_transposed : a, 3 + finfo.hash); + fill_quantized((pretranspose_B) ? b_transposed : b, 4 + finfo.hash); } return reference::gemmlowp_matrix_multiply_core<int32_t, TI, TW>((pretranspose_A ? a_transposed : a), (pretranspose_B ? b_transposed : b), shape_output, a_offset, b_offset); @@ -201,35 +243,77 @@ SimpleTensor<int32_t> compute_gemmlowp_reference(const TensorShape &shape_a, con } // namespace template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool run_twice = false> -class GEMMLowpMatrixMultiplyCoreValidationFixture : public framework::Fixture +class GEMMLowpGenericMatrixMultiplyCoreValidationFixture : public framework::Fixture { public: - void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset) + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, bool accumulate=false, bool dynamic_qinfo = false) { const auto a_qinfo = QuantizationInfo(1.0f / 255, a_offset); const auto b_qinfo = QuantizationInfo(1.0f / 255, b_offset); - _target = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo); - _reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo); + TensorFillInfo finfo; + _target = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate, dynamic_qinfo); + _reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate); } protected: - TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo) + TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, const bool accumulate, const bool dynamic_qinfo) { const auto output_qinfo = QuantizationInfo(); // No output stage - return compute_gemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, int32_t, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo); + return compute_gemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, int32_t, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, DataType::QASYMM8, DataType::QASYMM8, GEMMLowpOutputStageInfo(), false, finfo, accumulate, dynamic_qinfo); } - SimpleTensor<int32_t> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo) + SimpleTensor<int32_t> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, bool accumulate) { - return compute_gemmlowp_reference<reinterpret_input_as_3d, uint8_t, uint8_t, false, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo); + SimpleTensor<int32_t> ref_output = compute_gemmlowp_reference<reinterpret_input_as_3d, uint8_t, uint8_t, false, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, + DataType::QASYMM8, DataType::QASYMM8, finfo); + + if (accumulate) + { + SimpleTensor<int32_t> output{ shape_output, DataType::S32, 1 }; + 
fill(output, 6 + finfo.hash, finfo.min_output, finfo.max_output); + reference::arithmetic_operation<int32_t>(reference::ArithmeticOperation::ADD, output, ref_output, output, ConvertPolicy::SATURATE); + return output; + } + + return ref_output; } TensorType _target{}; SimpleTensor<int32_t> _reference{}; }; +template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool run_twice = false> +class GEMMLowpMatrixMultiplyCoreValidationFixture : protected GEMMLowpGenericMatrixMultiplyCoreValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, run_twice> +{ +public: + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset) + { + GEMMLowpGenericMatrixMultiplyCoreValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, run_twice>::setup(shape_a, shape_b, shape_output, a_offset, b_offset, false /* accumulate */); + } +}; + +template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool run_twice = false> +class GEMMLowpMatrixMultiplyAccumulateValidationFixture : protected GEMMLowpGenericMatrixMultiplyCoreValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, run_twice> +{ +public: + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset) + { + GEMMLowpGenericMatrixMultiplyCoreValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, run_twice>::setup(shape_a, shape_b, shape_output, a_offset, b_offset, true /* accumulate */); + } +}; + +template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool run_twice = false> +class GEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture : protected GEMMLowpGenericMatrixMultiplyCoreValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, run_twice> +{ +public: + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset) + { + GEMMLowpGenericMatrixMultiplyCoreValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, run_twice>::setup(shape_a, shape_b, shape_output, a_offset, b_offset, false /* accumulate */, true /* dynamic_qinfo */); + } +}; + template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, typename TI = uint8_t, typename TW = uint8_t, bool run_twice = false> -class GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture : public framework::Fixture +class GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture : public framework::Fixture { public: /** Dynamically initialize the quantization info with saturation awareness @@ -363,16 +447,16 @@ protected: TensorShape bias_shape(shape_b[0]); SimpleTensor<int32_t> bias{ bias_shape, DataType::S32, 1 }; - (run_twice) ? fill_bias_s32(bias, 5 + finfo.hash, finfo.min_bias, finfo.max_bias) : fill_bias_s32(bias, 2 + finfo.hash, finfo.min_bias, finfo.max_bias); // Fill bias with same seed as last run of gemmlowp_target + (run_twice) ? 
fill(bias, 5 + finfo.hash, finfo.min_bias, finfo.max_bias) : fill(bias, 2 + finfo.hash, finfo.min_bias, finfo.max_bias); // Fill bias with same seed as last run of gemmlowp_target switch(output_stage.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN: - return reference::gemmlowp_quantize_down_scale<int32_t, TW>(output, bias, + return reference::gemmlowp_quantize_down_scale<int32_t, TI>(output, bias, output_stage.gemmlowp_offset, output_stage.gemmlowp_multipliers, output_stage.gemmlowp_shifts, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound); break; case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - return reference::gemmlowp_quantize_down_scale_by_fixedpoint<int32_t, TW>(output, bias, + return reference::gemmlowp_quantize_down_scale_by_fixedpoint<int32_t, TI>(output, bias, output_stage.gemmlowp_multipliers, output_stage.gemmlowp_shifts, output_stage.gemmlowp_offset, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound); break; default: @@ -384,15 +468,71 @@ protected: SimpleTensor<TI> _reference{}; }; -template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, typename TI = uint8_t, typename TW = uint8_t> -class GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture : public - GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, TI, TW> +template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool run_twice = false> +class GEMMLowpDequantizedMatrixMultiplyValidationFixture : public framework::Fixture +{ +public: + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, bool accumulate) + { + const bool dynamic_qinfo = false; + const auto a_qinfo = QuantizationInfo(1.0f / 255, a_offset); + const auto b_qinfo = QuantizationInfo(5.0f / 255, b_offset); + TensorFillInfo finfo; + _target = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate, dynamic_qinfo); + _reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate, dynamic_qinfo); + } + +protected: + TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, const bool accumulate, const bool dynamic_qinfo) + { + const auto output_qinfo = QuantizationInfo(); + return compute_gemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, int32_t, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, GEMMLowpOutputStageInfo(), false, finfo, accumulate, dynamic_qinfo, DataType::F32); + } + + SimpleTensor<float> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, bool accumulate, const bool dynamic_qinfo) + { + QuantizationInfo s32_ref_output_quant_info = QuantizationInfo(a_qinfo.uniform().scale * b_qinfo.uniform().scale, 0, dynamic_qinfo); + + SimpleTensor<int32_t> s32_ref_output = compute_gemmlowp_reference<reinterpret_input_as_3d, int8_t, int8_t, false, false, 
run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, + DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, finfo); + s32_ref_output.quantization_info(s32_ref_output_quant_info); + + SimpleTensor<float> f32_ref_output(s32_ref_output.shape(), DataType::F32); + f32_ref_output = reference::dequantization_layer<float, int32_t>(s32_ref_output); + + if (accumulate) + { + SimpleTensor<float> output{ shape_output, DataType::F32, 1 }; + fill(output, 6 + finfo.hash, finfo.min_output, finfo.max_output); + reference::arithmetic_operation<float>(reference::ArithmeticOperation::ADD, output, f32_ref_output, output, ConvertPolicy::SATURATE); + return output; + } + + return f32_ref_output; + } + + TensorType _target{}; + SimpleTensor<float> _reference{}; +}; + +template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, typename TI = uint8_t, typename TW = uint8_t, bool run_twice = false> +class GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture : public GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, TI, TW, run_twice> +{ +public: + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, GEMMLowpOutputStageType output_stage_type, DataType data_type, bool reshape_b_only_on_first_run) + { + GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, TI, TW, run_twice>::setup(shape_a, shape_b, + shape_output, output_stage_type, data_type, reshape_b_only_on_first_run); + } +}; + +template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, typename TI = uint8_t, typename TW = uint8_t, bool run_twice = false> +class GEMMLowpBatchedMatrixMultiplyCoreFusedOffsetOutputFixture : public GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, TI, TW, run_twice> { public: - void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, GEMMLowpOutputStageType output_stage_type, DataType data_type) + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, GEMMLowpOutputStageType output_stage_type, DataType data_type, bool reshape_b_only_on_first_run) { - GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, TI, TW>::setup(shape_a, shape_b, - shape_output, output_stage_type, data_type, false /* reshape_b_only_on_first_run */); + GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, TI, TW, run_twice>::setup(shape_a, shape_b, shape_output, output_stage_type, data_type, reshape_b_only_on_first_run); } }; diff --git a/tests/validation/fixtures/ReorderFixture.h b/tests/validation/fixtures/ReorderFixture.h index 36e62696bc..8e28484c48 100644 --- a/tests/validation/fixtures/ReorderFixture.h +++ b/tests/validation/fixtures/ReorderFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Arm Limited. + * Copyright (c) 2023-2024 Arm Limited. 
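For the new GEMMLowpDequantizedMatrixMultiplyValidationFixture above, the reference path attaches a combined scale of a_qinfo.scale * b_qinfo.scale to the S32 accumulator and dequantizes it to F32, so element-wise the output is simply acc * a_scale * b_scale. A scalar sketch of that relationship with made-up values (not taken from any test dataset):

#include <cstdint>
#include <iostream>

// Scalar view of the fixture's reference dequantization: the int32 GEMMLowp
// accumulator is scaled by the product of the two input scales (the offsets
// are already handled inside the core reference).
float dequantize_acc(int32_t acc, float a_scale, float b_scale)
{
    return static_cast<float>(acc) * (a_scale * b_scale);
}

int main()
{
    const float a_scale = 1.0f / 255; // as in the fixture's a_qinfo
    const float b_scale = 5.0f / 255; // as in the fixture's b_qinfo
    std::cout << dequantize_acc(1300, a_scale, b_scale) << '\n';
    return 0;
}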
* * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ACL_TESTS_VALIDATION_FIXTURES_REORDERFIXTURE -#define ACL_TESTS_VALIDATION_FIXTURES_REORDERFIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_REORDERFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_REORDERFIXTURE_H #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" @@ -32,6 +32,7 @@ #include "tests/framework/Asserts.h" #include "tests/framework/Fixture.h" #include "tests/validation/reference/Reorder.h" +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" namespace arm_compute { @@ -44,10 +45,23 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ class ReorderValidationFixture : public framework::Fixture { public: + void check_hardware_supports(WeightFormat output_wf){ + if(!Scheduler::get().cpu_info().has_sve() && output_wf!=WeightFormat::OHWIo4){ + _hardware_supports = false; + } + if (Scheduler::get().cpu_info().has_sve() && arm_gemm::utils::get_vector_length<float>() != 8 && output_wf==WeightFormat::OHWIo8) + { + _hardware_supports = false; + } + } + void setup(TensorShape input_shape, TensorShape output_shape, WeightFormat input_wf, WeightFormat output_wf, DataType data_type) { - _target = compute_target(input_shape, output_shape, input_wf, output_wf, data_type); - _reference = compute_reference(input_shape, output_shape, output_wf, data_type); + check_hardware_supports(output_wf); + if (_hardware_supports){ + _target = compute_target(input_shape, output_shape, input_wf, output_wf, data_type); + _reference = compute_reference(input_shape, output_shape, output_wf, data_type); + } } protected: @@ -98,6 +112,7 @@ public: return reference::reorder_layer<T>(src, output_shape, output_wf); } + bool _hardware_supports = true; TensorType _target{}; SimpleTensor<T> _reference{}; }; @@ -105,4 +120,4 @@ public: } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ACL_TESTS_VALIDATION_FIXTURES_REORDERFIXTURE */ +#endif // ACL_TESTS_VALIDATION_FIXTURES_REORDERFIXTURE_H diff --git a/tests/validation/fixtures/ScatterLayerFixture.h b/tests/validation/fixtures/ScatterLayerFixture.h index bda5532a51..af161ef98b 100644 --- a/tests/validation/fixtures/ScatterLayerFixture.h +++ b/tests/validation/fixtures/ScatterLayerFixture.h @@ -27,8 +27,9 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" #include "tests/Globals.h" -#include "tests/framework/Asserts.h" // Required for ARM_COMPUTE_ASSERT +#include "tests/framework/Asserts.h" #include "tests/framework/Fixture.h" +#include "tests/validation/Helpers.h" #include "tests/validation/Validation.h" #include "tests/validation/reference/ScatterLayer.h" #include "tests/SimpleTensor.h" @@ -46,21 +47,46 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ class ScatterGenericValidationFixture : public framework::Fixture { public: - void setup(TensorShape src_shape, TensorShape updates_shape, TensorShape indices_shape, TensorShape out_shape, DataType data_type, ScatterInfo scatter_info, QuantizationInfo src_qinfo = QuantizationInfo(), QuantizationInfo o_qinfo = QuantizationInfo()) + void setup(TensorShape src_shape, TensorShape updates_shape, TensorShape indices_shape, + TensorShape out_shape, DataType data_type, ScatterInfo scatter_info, bool inplace, bool padding, + QuantizationInfo src_qinfo = QuantizationInfo(), QuantizationInfo o_qinfo = 
QuantizationInfo()) { - _target = compute_target(src_shape, updates_shape, indices_shape, out_shape, data_type, scatter_info, src_qinfo, o_qinfo); + // this is for improving randomness across tests + _hash = src_shape[0] + src_shape[1] + src_shape[2] + src_shape[3] + src_shape[4] + src_shape[5] + + updates_shape[0] + updates_shape[1] + updates_shape[2] + updates_shape[3] + + updates_shape[4] + updates_shape[5] + + indices_shape[0] + indices_shape[1] + indices_shape[2] + indices_shape[3]; + + _target = compute_target(src_shape, updates_shape, indices_shape, out_shape, data_type, scatter_info, inplace, padding, src_qinfo, o_qinfo); _reference = compute_reference(src_shape, updates_shape, indices_shape, out_shape, data_type,scatter_info, src_qinfo , o_qinfo); } protected: template <typename U> - void fill(U &&tensor, int i, float lo = -1.f, float hi = 1.f) + void fill(U &&tensor, int i) { switch(tensor.data_type()) { case DataType::F32: + case DataType::F16: + { + std::uniform_real_distribution<float> distribution(-10.f, 10.f); + library->fill(tensor, distribution, i); + break; + } + case DataType::S32: + case DataType::S16: + case DataType::S8: + { + std::uniform_int_distribution<int32_t> distribution(-100, 100); + library->fill(tensor, distribution, i); + break; + } + case DataType::U32: + case DataType::U16: + case DataType::U8: { - std::uniform_real_distribution<float> distribution(lo, hi); + std::uniform_int_distribution<uint32_t> distribution(0, 200); library->fill(tensor, distribution, i); break; } @@ -71,37 +97,47 @@ protected: } } - // This is used to fill indices tensor with U32 datatype. + // This is used to fill indices tensor with S32 datatype. // Used to prevent ONLY having values that are out of bounds. template <typename U> void fill_indices(U &&tensor, int i, const TensorShape &shape) { - // Calculate max indices the shape should contain. Add an arbitrary constant to allow testing for some out of bounds values. - const uint32_t max = std::max({shape[0] , shape[1], shape[2]}) + 5; - library->fill_tensor_uniform(tensor, i, static_cast<uint32_t>(0), static_cast<uint32_t>(max)); + // Calculate max indices the shape should contain. Add an arbitrary value to allow testing for some out of bounds values (In this case min dimension) + const int32_t max = std::min({shape[0] , shape[1], shape[2]}) + 1; + library->fill_tensor_uniform(tensor, i, static_cast<int32_t>(0), static_cast<int32_t>(max)); } - TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c, const TensorShape &out_shape, DataType data_type, const ScatterInfo info, QuantizationInfo a_qinfo, QuantizationInfo o_qinfo) + TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c, + const TensorShape &out_shape, DataType data_type, const ScatterInfo info, bool inplace, bool padding, + QuantizationInfo a_qinfo, QuantizationInfo o_qinfo) { // 1. Create relevant tensors using ScatterInfo data structure. // ---------------------------------------------------- // In order - src, updates, indices, output. 
TensorType src = create_tensor<TensorType>(shape_a, data_type, 1, a_qinfo); TensorType updates = create_tensor<TensorType>(shape_b, data_type, 1, a_qinfo); - TensorType indices = create_tensor<TensorType>(shape_c, DataType::U32, 1, QuantizationInfo()); + TensorType indices = create_tensor<TensorType>(shape_c, DataType::S32, 1, QuantizationInfo()); TensorType dst = create_tensor<TensorType>(out_shape, data_type, 1, o_qinfo); FunctionType scatter; // Configure operator - // When scatter_info.zero_initialization is true, pass nullptr to scatter function. + // When scatter_info.zero_initialization is true, pass nullptr for src + // because dst does not need to be initialized with src values. if(info.zero_initialization) { scatter.configure(nullptr, &updates, &indices, &dst, info); } else { - scatter.configure(&src, &updates, &indices, &dst, info); + if(inplace) + { + scatter.configure(&src, &updates, &indices, &src, info); + } + else + { + scatter.configure(&src, &updates, &indices, &dst, info); + } } // Assertions @@ -110,51 +146,92 @@ protected: ARM_COMPUTE_ASSERT(indices.info()->is_resizable()); ARM_COMPUTE_ASSERT(dst.info()->is_resizable()); + if(padding) + { + add_padding_x({ &src, &updates, &indices}); + + if(!inplace) + { + add_padding_x({ &dst }); + } + } + // Allocate tensors src.allocator()->allocate(); updates.allocator()->allocate(); indices.allocator()->allocate(); - dst.allocator()->allocate(); + + if(!inplace) + { + dst.allocator()->allocate(); + } ARM_COMPUTE_ASSERT(!src.info()->is_resizable()); ARM_COMPUTE_ASSERT(!updates.info()->is_resizable()); ARM_COMPUTE_ASSERT(!indices.info()->is_resizable()); - ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); + + if(!inplace) + { + ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); + } // Fill update (a) and indices (b) tensors. - fill(AccessorType(src), 0); - fill(AccessorType(updates), 1); - fill_indices(AccessorType(indices), 2, out_shape); + fill(AccessorType(src), 0 + _hash); + fill(AccessorType(updates), 1+ _hash); + fill_indices(AccessorType(indices), 2 + _hash, out_shape); scatter.run(); - return dst; + if(inplace) + { + return src; + } + else + { + return dst; + } } - SimpleTensor<T> compute_reference(const TensorShape &a_shape, const TensorShape &b_shape, const TensorShape &c_shape, const TensorShape &out_shape, DataType data_type, - ScatterInfo info, QuantizationInfo a_qinfo, QuantizationInfo o_qinfo) + SimpleTensor<T> compute_reference(const TensorShape &a_shape, const TensorShape &b_shape, const TensorShape &c_shape, + const TensorShape &out_shape, DataType data_type, ScatterInfo info, QuantizationInfo a_qinfo, QuantizationInfo o_qinfo) { // Output Quantization not currently in use - fixture should be extended to support this. ARM_COMPUTE_UNUSED(o_qinfo); + TensorShape src_shape = a_shape; + TensorShape updates_shape = b_shape; + TensorShape indices_shape = c_shape; + const int num_ind_dims = c_shape.num_dimensions(); + + // 1. Collapse batch index into a single dim if necessary for update tensor and indices tensor. + if(num_ind_dims >= 3) + { + indices_shape = indices_shape.collapsed_from(1); + updates_shape = updates_shape.collapsed_from(updates_shape.num_dimensions() - (num_ind_dims -1)); // Collapses batch dims + } + + // 2. Collapse data dims into a single dim. + // Collapse all src dims into 2 dims. First one holding data, the other being the index we iterate over. + src_shape.collapse(updates_shape.num_dimensions() - 1); // Collapse all data dims into single dim. 
+ src_shape = src_shape.collapsed_from(1); // Collapse all index dims into a single dim + updates_shape.collapse(updates_shape.num_dimensions() - 1); // Collapse data dims (all except last dim which is batch dim) // Create reference tensors - SimpleTensor<T> src{ a_shape, data_type, 1, a_qinfo }; - SimpleTensor<T> updates{b_shape, data_type, 1, QuantizationInfo() }; - SimpleTensor<uint32_t> indices{ c_shape, DataType::U32, 1, QuantizationInfo() }; + SimpleTensor<T> src{ src_shape, data_type, 1, a_qinfo }; + SimpleTensor<T> updates{updates_shape, data_type, 1, QuantizationInfo() }; + SimpleTensor<int32_t> indices{ indices_shape, DataType::S32, 1, QuantizationInfo() }; // Fill reference - fill(src, 0); - fill(updates, 1); - fill_indices(indices, 2, out_shape); - - // Calculate individual reference. - auto result = reference::scatter_layer<T>(src, updates, indices, out_shape, info); + fill(src, 0 + _hash); + fill(updates, 1 + _hash); + fill_indices(indices, 2 + _hash, out_shape); - return result; + // Calculate individual reference using collapsed shapes + return reference::scatter_layer<T>(src, updates, indices, out_shape, info); } TensorType _target{}; SimpleTensor<T> _reference{}; + int32_t _hash{}; }; // This fixture will use the same shape for updates as indices. @@ -162,9 +239,12 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ class ScatterValidationFixture : public ScatterGenericValidationFixture<TensorType, AccessorType, FunctionType, T> { public: - void setup(TensorShape src_shape, TensorShape update_shape, TensorShape indices_shape, TensorShape out_shape, DataType data_type, ScatterFunction func, bool zero_init) + void setup(TensorShape src_shape, TensorShape update_shape, TensorShape indices_shape, + TensorShape out_shape, DataType data_type, ScatterFunction func, bool zero_init, bool inplace, bool padding) { - ScatterGenericValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(src_shape, update_shape, indices_shape, out_shape, data_type, ScatterInfo(func, zero_init), QuantizationInfo(), QuantizationInfo()); + ScatterGenericValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(src_shape, update_shape, + indices_shape, out_shape, data_type, ScatterInfo(func, zero_init), inplace, padding, + QuantizationInfo(), QuantizationInfo()); } }; diff --git a/tests/validation/reference/DequantizationLayer.cpp b/tests/validation/reference/DequantizationLayer.cpp index 64a89aa6a0..67d69c2c38 100644 --- a/tests/validation/reference/DequantizationLayer.cpp +++ b/tests/validation/reference/DequantizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2024 Arm Limited. 
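The scatter reference path shown above first collapses shapes so that only one data dimension, one index dimension and one batch dimension remain before the reference is called. The small illustration below applies the same TensorShape calls to made-up shapes; only the collapsing pattern mirrors the fixture, the values are arbitrary.

#include "arm_compute/core/TensorShape.h"

#include <iostream>

int main()
{
    using arm_compute::TensorShape;

    // Made-up example: updates = [data, idx, batch0, batch1], indices = [x, batch0, batch1].
    TensorShape updates_shape(8U, 4U, 2U, 3U);
    TensorShape indices_shape(4U, 2U, 3U);
    const int   num_ind_dims = static_cast<int>(indices_shape.num_dimensions());

    // Same pattern as the fixture: collapse the batch dims into a single dim.
    indices_shape = indices_shape.collapsed_from(1);
    updates_shape = updates_shape.collapsed_from(updates_shape.num_dimensions() - (num_ind_dims - 1));

    std::cout << "indices dims: " << indices_shape.num_dimensions()
              << ", updates dims: " << updates_shape.num_dimensions() << '\n';
    return 0;
}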
* * SPDX-License-Identifier: MIT * @@ -59,6 +59,12 @@ TOut dequantize(int16_t val, const UniformQuantizationInfo qinfo, DataType dt) ARM_COMPUTE_UNUSED(dt); return static_cast<TOut>(dequantize_qsymm16(val, qinfo)); } +template <typename TOut> +TOut dequantize(int32_t val, const UniformQuantizationInfo qinfo, DataType dt) +{ + ARM_COMPUTE_UNUSED(dt); + return static_cast<TOut>(dequantize_s32(val, qinfo)); +} } // namespace template <typename TOut, typename TIn> SimpleTensor<TOut> dequantization_layer(const SimpleTensor<TIn> &src) @@ -115,6 +121,7 @@ template SimpleTensor<half> dequantization_layer(const SimpleTensor<int8_t> &src template SimpleTensor<float> dequantization_layer(const SimpleTensor<int8_t> &src); template SimpleTensor<half> dequantization_layer(const SimpleTensor<int16_t> &src); template SimpleTensor<float> dequantization_layer(const SimpleTensor<int16_t> &src); +template SimpleTensor<float> dequantization_layer(const SimpleTensor<int32_t> &src); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/reference/GEMM.cpp b/tests/validation/reference/GEMM.cpp index 20f1139a02..d513343796 100644 --- a/tests/validation/reference/GEMM.cpp +++ b/tests/validation/reference/GEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021,2024 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" +#include "tests/validation/reference/ArithmeticOperations.h" namespace arm_compute { @@ -180,17 +181,22 @@ SimpleTensor<T> gemm_mixed_precision( return dst; } -template SimpleTensor<float> -gemm(const SimpleTensor<float> &a, const SimpleTensor<float> &b, const SimpleTensor<float> &c, float alpha, float beta); -template SimpleTensor<bfloat16> gemm(const SimpleTensor<bfloat16> &a, - const SimpleTensor<bfloat16> &b, - const SimpleTensor<bfloat16> &c, - float alpha, - float beta); -template SimpleTensor<half> -gemm(const SimpleTensor<half> &a, const SimpleTensor<half> &b, const SimpleTensor<half> &c, float alpha, float beta); -template SimpleTensor<half> gemm_mixed_precision( - const SimpleTensor<half> &a, const SimpleTensor<half> &b, const SimpleTensor<half> &c, float alpha, float beta); +template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type> +void gemm_accumulate(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta, SimpleTensor<T> &dst) +{ + // Compute reference + SimpleTensor<T> dst_gemm = gemm(a, b, c, alpha, beta); + reference::arithmetic_operation<T>(reference::ArithmeticOperation::ADD, dst, dst_gemm, dst, ConvertPolicy::SATURATE); +} + +template SimpleTensor<bfloat16> gemm(const SimpleTensor<bfloat16> &a, const SimpleTensor<bfloat16> &b, const SimpleTensor<bfloat16> &c, float alpha, float beta); +template SimpleTensor<float> gemm(const SimpleTensor<float> &a, const SimpleTensor<float> &b, const SimpleTensor<float> &c, float alpha, float beta); +template SimpleTensor<half> gemm(const SimpleTensor<half> &a, const SimpleTensor<half> &b, const SimpleTensor<half> &c, float alpha, float beta); + +template void gemm_accumulate(const SimpleTensor<float> &a, const SimpleTensor<float> &b, const SimpleTensor<float> &c, float alpha, float beta, SimpleTensor<float> &dst); +template void gemm_accumulate(const SimpleTensor<half> &a, const SimpleTensor<half> &b, const SimpleTensor<half> &c, float alpha, float beta, SimpleTensor<half> &dst); + +template 
SimpleTensor<half> gemm_mixed_precision(const SimpleTensor<half> &a, const SimpleTensor<half> &b, const SimpleTensor<half> &c, float alpha, float beta); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/reference/GEMM.h b/tests/validation/reference/GEMM.h index 5feaeda584..1b97570122 100644 --- a/tests/validation/reference/GEMM.h +++ b/tests/validation/reference/GEMM.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2019, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_GEMM_H -#define ARM_COMPUTE_TEST_GEMM_H +#ifndef ACL_TESTS_VALIDATION_REFERENCE_GEMM_H +#define ACL_TESTS_VALIDATION_REFERENCE_GEMM_H #include "tests/SimpleTensor.h" #include "tests/validation/Helpers.h" @@ -41,8 +41,11 @@ SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const S template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0> SimpleTensor<T> gemm_mixed_precision(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta); +template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0> +void gemm_accumulate(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta, SimpleTensor<T> &dst); + } // namespace reference } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_GEMM_H */ +#endif // ACL_TESTS_VALIDATION_REFERENCE_GEMM_H diff --git a/tests/validation/reference/GEMMLowp.cpp b/tests/validation/reference/GEMMLowp.cpp index 1615b51e73..30c577d850 100644 --- a/tests/validation/reference/GEMMLowp.cpp +++ b/tests/validation/reference/GEMMLowp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2024 Arm Limited. 
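The gemm_accumulate reference introduced above reuses the existing gemm reference and then adds its result onto a pre-filled destination, i.e. dst = dst + alpha * A * B + beta * C. Below is a scalar sketch of that semantics only; it ignores the 3D-reinterpret, transpose and saturation handling done by the real reference and fixtures.

#include <cstddef>
#include <vector>

// Sketch of the accumulate semantics exercised by the new fixtures: the
// destination is pre-filled and the GEMM result is added on top of it.
void gemm_accumulate_sketch(const std::vector<float> &a, const std::vector<float> &b,
                            const std::vector<float> &c, std::vector<float> &dst,
                            std::size_t m, std::size_t n, std::size_t k, float alpha, float beta)
{
    for (std::size_t i = 0; i < m; ++i)
    {
        for (std::size_t j = 0; j < n; ++j)
        {
            float acc = 0.f;
            for (std::size_t p = 0; p < k; ++p)
            {
                acc += a[i * k + p] * b[p * n + j];
            }
            dst[i * n + j] += alpha * acc + beta * c[i * n + j]; // accumulate into pre-filled dst
        }
    }
}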
* * SPDX-License-Identifier: MIT * @@ -24,6 +24,7 @@ #include "GEMMLowp.h" #include "arm_compute/core/Types.h" +#include "tests/validation/reference/ArithmeticOperations.h" #include "tests/validation/reference/UtilsQuantizedAsymm.h" #include "support/ToolchainSupport.h" @@ -230,6 +231,13 @@ SimpleTensor<T_out> gemmlowp_matrix_multiply_core(const SimpleTensor<T_in> &a, c return c; } +template <typename T_out, typename T_in, typename T_in_1> +void gemmlowp_matrix_multiply_core_accumulate(const SimpleTensor<T_in> &a, const SimpleTensor<T_in_1> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset, SimpleTensor<T_out> &dst) +{ + SimpleTensor<T_out> dst_gemm = gemmlowp_matrix_multiply_core<T_out, T_in, T_in_1>(a, b, shape_c, a_offset, b_offset); + reference::arithmetic_operation<T_out>(reference::ArithmeticOperation::ADD, dst, dst_gemm, dst, ConvertPolicy::SATURATE); +} + // used to validate assembly kernels which don't know anything about offsets template <typename T1, typename T2, typename T3> SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c) @@ -336,6 +344,8 @@ template SimpleTensor<int8_t> gemmlowp_quantize_down_scale(const SimpleTensor<in std::vector<int32_t> result_shift, int32_t min, int32_t max); template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset); template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset); +template void gemmlowp_matrix_multiply_core_accumulate(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset, SimpleTensor<int32_t> &dst); +template void gemmlowp_matrix_multiply_core_accumulate(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset, SimpleTensor<int32_t> &dst); template SimpleTensor<int32_t> gemmlowp<int32_t, int8_t, int8_t>(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c); template SimpleTensor<int32_t> gemmlowp<int32_t, uint8_t, uint8_t>(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c); template SimpleTensor<int32_t> gemmlowp<int32_t, uint8_t, int8_t>(const SimpleTensor<uint8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c); diff --git a/tests/validation/reference/GEMMLowp.h b/tests/validation/reference/GEMMLowp.h index 99015d71fb..6e471fdad1 100644 --- a/tests/validation/reference/GEMMLowp.h +++ b/tests/validation/reference/GEMMLowp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_TEST_GEMMLOWP_H -#define ARM_COMPUTE_TEST_GEMMLOWP_H +#ifndef ACL_TESTS_VALIDATION_REFERENCE_GEMMLOWP_H +#define ACL_TESTS_VALIDATION_REFERENCE_GEMMLOWP_H #include "tests/SimpleTensor.h" #include "tests/validation/Helpers.h" @@ -38,6 +38,9 @@ namespace reference template <typename T1, typename T2, typename T3> SimpleTensor<T1> gemmlowp_matrix_multiply_core(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset); +template <typename T1, typename T2, typename T3> +void gemmlowp_matrix_multiply_core_accumulate(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset, SimpleTensor<T1> &dst_); + template <typename T1, typename T2, typename T3 = T2> SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c); @@ -71,4 +74,4 @@ SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<TIn> } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_GEMMLOWP_H */ +#endif // ACL_TESTS_VALIDATION_REFERENCE_GEMMLOWP_H diff --git a/tests/validation/reference/QuantizationLayer.cpp b/tests/validation/reference/QuantizationLayer.cpp index 27665375c3..ad7ba7ac43 100644 --- a/tests/validation/reference/QuantizationLayer.cpp +++ b/tests/validation/reference/QuantizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/tests/validation/reference/ScatterLayer.cpp b/tests/validation/reference/ScatterLayer.cpp index 920f2b9990..55c48a9002 100644 --- a/tests/validation/reference/ScatterLayer.cpp +++ b/tests/validation/reference/ScatterLayer.cpp @@ -23,6 +23,7 @@ */ #include "ScatterLayer.h" #include "tests/validation/Helpers.h" +#include "arm_compute/core/TensorShape.h" namespace arm_compute { @@ -62,51 +63,89 @@ T reduce_op(const T &current,const T &update,const ScatterFunction func) } template float reduce_op(const float &current,const float &update,const ScatterFunction func); +template half reduce_op(const half &current,const half &update,const ScatterFunction func); } -// Note : This function currently only supports 1D src, 1D updates, 2D indices, 1D output tensors. +// NOTE: This function expects collapsed tensors as input. +// Batch dims for update/indices tensors should be collapsed into a single dim. +// Data dims should be collapsed into a single dim for both update and src tensors prior to calling this function. template <typename T> -SimpleTensor<T> scatter_layer_internal(const SimpleTensor<T> &src, const SimpleTensor<T> &updates, const SimpleTensor<uint32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info) +SimpleTensor<T> scatter_layer_internal(const SimpleTensor<T> &src, const SimpleTensor<T> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info) { + // 1. If zero initialization variable is false, copy src data to dst. SimpleTensor<T> dst{ out_shape, src.data_type(), 1 }; - - // 1. If zero initialization variable is true, fill dst with 0 values. Else copy src data to dst. - if(info.zero_initialization) - { - for (int i = 0; i < src.num_elements(); ++i) - { - dst[i] = static_cast<T>(0); - } - } - else + if(!info.zero_initialization) { std::copy_n(src.data(), src.num_elements(), dst.data()); } - // 2. Get max index of output tensor, then iterate over index tensor.
- const auto x_bound = dst.shape().x(); + // Number of elements between each value of the dim being iterated through + const unsigned int data_stride = updates.shape().total_size_lower(updates.shape().num_dimensions() - 1); + const unsigned int no_output_dims = out_shape.num_dimensions(); + // Calculate output stride at given index for all output dims. + std::vector<unsigned int> out_stride_at_idx(no_output_dims); + for (unsigned int i = 0 ; i < no_output_dims; i++) + { + out_stride_at_idx[i] = out_shape.total_size_lower(i); + } - for(int i = 0; i < indices.num_elements(); ++i) + const unsigned int indices_x_dim = static_cast<unsigned int>(indices.shape()[0]); + const unsigned int indices_y_dim = static_cast<unsigned int>(indices.shape()[1]); + + // 2. Iterate over indices tensor y-dim and replace sections of dst tensor with relevant areas of update tensor. + for(unsigned int i = 0; i < indices_y_dim; i++) { - // 3. Check whether index is out of bounds for dst, if not then apply reduce op. - const auto index = indices[i]; - if (index < x_bound) // Note : index is always >= 0 as datatype is unsigned. + // NOTE : Currently, indices.shape() == [X, Y, 1, 1], where X is the indices dim and Y is the batch dim + // Starting index for both the update and indices tensors. + const unsigned int update_dim_start = i * data_stride; + const unsigned int indices_dim_start = i * indices_x_dim; + bool out_of_bounds = false; + unsigned int out_offset_acc = 0; + + // Iterate over each indices value for the relevant batch and accumulate the offset. + for(unsigned int j = 0; j < indices_x_dim; j++) + { + // Get first index value with i * indices_x_dim (iterating through y-dim/batch idx), then iterate through x dim by adding k + const int index_value = indices[indices_dim_start + j]; + const unsigned int out_dim = no_output_dims - (j+1); // Calculate corresponding output dim to current index value. 
+ if(index_value < static_cast<int>(out_shape[out_dim]) && index_value >= 0) + { + out_offset_acc += (index_value * out_stride_at_idx[out_dim]); // offset accumulation + } + else + { + out_of_bounds = true; + break; + } + } + + // If not out of bounds, copy update tensor elements to output + if(!out_of_bounds) { - dst[index] = reduce_op(dst[index], updates[i], info.func); + for (unsigned int j = 0 ; j < data_stride; j++) + { + dst[out_offset_acc + j] = reduce_op(dst[out_offset_acc + j], updates[update_dim_start + j], info.func); + } } } return dst; } template <typename T> -SimpleTensor<T> scatter_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &updates, const SimpleTensor<uint32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info) +SimpleTensor<T> scatter_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info) { return scatter_layer_internal<T>(src, updates, indices, out_shape, info); } -template SimpleTensor<float> scatter_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &updates, const SimpleTensor<uint32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info); - +template SimpleTensor<float> scatter_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info); +template SimpleTensor<half> scatter_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info); +template SimpleTensor<int32_t> scatter_layer(const SimpleTensor<int32_t> &src, const SimpleTensor<int32_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info); +template SimpleTensor<uint32_t> scatter_layer(const SimpleTensor<uint32_t> &src, const SimpleTensor<uint32_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info); +template SimpleTensor<int16_t> scatter_layer(const SimpleTensor<int16_t> &src, const SimpleTensor<int16_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info); +template SimpleTensor<uint16_t> scatter_layer(const SimpleTensor<uint16_t> &src, const SimpleTensor<uint16_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info); +template SimpleTensor<int8_t> scatter_layer(const SimpleTensor<int8_t> &src, const SimpleTensor<int8_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info); +template SimpleTensor<uint8_t> scatter_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/reference/ScatterLayer.h b/tests/validation/reference/ScatterLayer.h index dc441a8894..97d5e70b0d 100644 --- a/tests/validation/reference/ScatterLayer.h +++ b/tests/validation/reference/ScatterLayer.h @@ -37,10 +37,10 @@ namespace validation namespace reference { template <typename T> -SimpleTensor<T> scatter_layer_internal(const SimpleTensor<T> &src, const SimpleTensor<T> &update, const SimpleTensor<uint32_t> &indices, const TensorShape &shape, const ScatterInfo &info); +SimpleTensor<T> scatter_layer_internal(const 
SimpleTensor<T> &src, const SimpleTensor<T> &update, const SimpleTensor<int32_t> &indices, const TensorShape &shape, const ScatterInfo &info); template <typename T> -SimpleTensor<T> scatter_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &update, const SimpleTensor<uint32_t> &indices, const TensorShape &shape, const ScatterInfo &info); +SimpleTensor<T> scatter_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &update, const SimpleTensor<int32_t> &indices, const TensorShape &shape, const ScatterInfo &info); } // namespace reference } // namespace validation } // namespace test
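Finally, the rewritten scatter reference walks the indices tensor batch by batch and turns each index tuple into a flat output offset: every index value addresses one output dimension (outermost first) and contributes index * stride of that dimension, and any out-of-bounds value causes the whole update row to be skipped. A self-contained sketch of just that offset computation is given below; the function name and the shape/stride containers are made up for illustration.

#include <cstddef>
#include <cstdint>
#include <vector>

// Offset accumulation as performed per indices row by the new scatter reference.
// Returns false when any index is out of bounds, matching the reference's
// behaviour of skipping the corresponding update row.
bool flat_output_offset(const std::vector<int32_t>      &index_tuple,
                        const std::vector<unsigned int> &out_shape,   // per-dimension extents
                        const std::vector<unsigned int> &out_strides, // elements between steps of each dimension
                        std::size_t                     &offset)
{
    offset = 0;
    const std::size_t no_output_dims = out_shape.size();
    for (std::size_t j = 0; j < index_tuple.size(); ++j)
    {
        const std::size_t out_dim = no_output_dims - (j + 1); // outermost output dim first
        const int32_t     value   = index_tuple[j];
        if (value < 0 || value >= static_cast<int32_t>(out_shape[out_dim]))
        {
            return false; // out of bounds: skip this update row
        }
        offset += static_cast<std::size_t>(value) * out_strides[out_dim];
    }
    return true;
}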