40 files changed, 1595 insertions, 320 deletions
diff --git a/tests/validation/reference/ActivationLayer.cpp b/tests/validation/reference/ActivationLayer.cpp
index 664b969125..2172362bdd 100644
--- a/tests/validation/reference/ActivationLayer.cpp
+++ b/tests/validation/reference/ActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020,2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "ActivationLayer.h"
 
 #include "arm_compute/core/Types.h"
+
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
@@ -40,7 +41,7 @@ SimpleTensor<T> activation_layer(const SimpleTensor<T> &src, ActivationLayerInfo
     ARM_COMPUTE_UNUSED(oq_info);
 
     // Create reference
-    SimpleTensor<T> dst{ src.shape(), src.data_type(), 1 };
+    SimpleTensor<T> dst{src.shape(), src.data_type(), 1};
 
     // Compute reference
     const T a(info.a());
@@ -48,7 +49,7 @@ SimpleTensor<T> activation_layer(const SimpleTensor<T> &src, ActivationLayerInfo
 #if defined(_OPENMP)
     #pragma omp parallel for
 #endif /* _OPENMP */
-    for(int i = 0; i < src.num_elements(); ++i)
+    for (int i = 0; i < src.num_elements(); ++i)
     {
         dst[i] = activate_float<T>(src[i], a, b, info.activation());
     }
@@ -57,7 +58,8 @@ SimpleTensor<T> activation_layer(const SimpleTensor<T> &src, ActivationLayerInfo
 }
 
 template <>
-SimpleTensor<uint8_t> activation_layer<uint8_t>(const SimpleTensor<uint8_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info)
+SimpleTensor<uint8_t>
+activation_layer<uint8_t>(const SimpleTensor<uint8_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info)
 {
     const QuantizationInfo dst_qinfo = oq_info.empty() ? src.quantization_info() : oq_info;
 
@@ -68,7 +70,8 @@ SimpleTensor<uint8_t> activation_layer<uint8_t>(const SimpleTensor<uint8_t> &src
 }
 
 template <>
-SimpleTensor<int8_t> activation_layer<int8_t>(const SimpleTensor<int8_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info)
+SimpleTensor<int8_t>
+activation_layer<int8_t>(const SimpleTensor<int8_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info)
 {
     const QuantizationInfo dst_qinfo = oq_info.empty() ? src.quantization_info() : oq_info;
 
@@ -79,7 +82,8 @@ SimpleTensor<int8_t> activation_layer<int8_t>(const SimpleTensor<int8_t> &src, A
 }
 
 template <>
-SimpleTensor<int16_t> activation_layer<int16_t>(const SimpleTensor<int16_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info)
+SimpleTensor<int16_t>
+activation_layer<int16_t>(const SimpleTensor<int16_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info)
 {
     const QuantizationInfo dst_qinfo = oq_info.empty() ? src.quantization_info() : oq_info;
 
@@ -88,9 +92,14 @@ SimpleTensor<int16_t> activation_layer<int16_t>(const SimpleTensor<int16_t> &src
     SimpleTensor<int16_t> dst     = convert_to_symmetric<int16_t>(dst_tmp, dst_qinfo);
     return dst;
 }
-template SimpleTensor<int32_t> activation_layer(const SimpleTensor<int32_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info);
-template SimpleTensor<float> activation_layer(const SimpleTensor<float> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info);
-template SimpleTensor<half> activation_layer(const SimpleTensor<half> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info);
+template SimpleTensor<int32_t>
+activation_layer(const SimpleTensor<int32_t> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info);
+template SimpleTensor<float>
+activation_layer(const SimpleTensor<float> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info);
+template SimpleTensor<half>
+activation_layer(const SimpleTensor<half> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info);
+template SimpleTensor<bfloat16>
+activation_layer(const SimpleTensor<bfloat16> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ActivationLayer.h b/tests/validation/reference/ActivationLayer.h
index 8aad1af63e..7f896bd696 100644
--- a/tests/validation/reference/ActivationLayer.h
+++ b/tests/validation/reference/ActivationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020,2022,2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_ACTIVATION_LAYER_H
-#define ARM_COMPUTE_TEST_ACTIVATION_LAYER_H
+#ifndef ACL_TESTS_VALIDATION_REFERENCE_ACTIVATIONLAYER_H
+#define ACL_TESTS_VALIDATION_REFERENCE_ACTIVATIONLAYER_H
 
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
@@ -40,7 +40,7 @@ inline T activate_float(T x, T a, T b, ActivationLayerInfo::ActivationFunction a
 {
     T ret;
 
-    switch(activation)
+    switch (activation)
     {
         case ActivationLayerInfo::ActivationFunction::ABS:
             ret = std::abs(x);
@@ -61,13 +61,13 @@ inline T activate_float(T x, T a, T b, ActivationLayerInfo::ActivationFunction a
             ret = std::min<T>(a, std::max<T>(b, x));
             break;
         case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
-            ret = (x > 0) ? x : a * x;
+            ret = x > static_cast<T>(0) ? x : static_cast<T>(a * x);
             break;
         case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
-            ret = std::log(static_cast<T>(1) + std::exp(x));
+            ret = std::log(static_cast<T>(1) + std::exp(static_cast<double>(x)));
             break;
         case ActivationLayerInfo::ActivationFunction::ELU:
-            ret = (x > 0) ? x : a * (std::exp(x) - static_cast<T>(1));
+            ret = x > static_cast<T>(0) ? x : static_cast<T>(a * (std::exp(x) - static_cast<T>(1)));
             break;
         case ActivationLayerInfo::ActivationFunction::SQRT:
             ret = std::sqrt(x);
@@ -82,7 +82,14 @@ inline T activate_float(T x, T a, T b, ActivationLayerInfo::ActivationFunction a
             ret = x;
             break;
         case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
-            ret = x * ((std::min(std::max(static_cast<T>(x + 3), static_cast<T>(0.0f)), static_cast<T>(6.0f))) * 0.166666667f);
+            ret = x * ((std::min(std::max(static_cast<T>(x + 3), static_cast<T>(0.0f)), static_cast<T>(6.0f))) *
+                       0.166666667f);
+            break;
+        case ActivationLayerInfo::ActivationFunction::SWISH:
+            ret = static_cast<T>(x) / (static_cast<T>(1) + std::exp(-a * x));
+            break;
+        case ActivationLayerInfo::ActivationFunction::GELU:
+            ret = x * 0.5f * (1 + erf(x / std::sqrt(2.0f)));
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported activation function");
@@ -93,9 +100,11 @@ inline T activate_float(T x, T a, T b, ActivationLayerInfo::ActivationFunction a
 }
 
 template <typename T>
-SimpleTensor<T> activation_layer(const SimpleTensor<T> &src, ActivationLayerInfo info, const QuantizationInfo &oq_info = QuantizationInfo());
+SimpleTensor<T> activation_layer(const SimpleTensor<T>  &src,
+                                 ActivationLayerInfo     info,
+                                 const QuantizationInfo &oq_info = QuantizationInfo());
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_ACTIVATION_LAYER_H */
+#endif // ACL_TESTS_VALIDATION_REFERENCE_ACTIVATIONLAYER_H
diff --git a/tests/validation/reference/BatchToSpaceLayer.cpp b/tests/validation/reference/BatchToSpaceLayer.cpp
index 404ee73cac..63d121f59b 100644
--- a/tests/validation/reference/BatchToSpaceLayer.cpp
+++ b/tests/validation/reference/BatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,8 +23,10 @@
  */
 #include "BatchToSpaceLayer.h"
 
+#include "arm_compute/core/Validate.h"
 #include "tests/validation/Helpers.h"
 
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 namespace arm_compute
 {
 namespace test
@@ -35,32 +37,37 @@ namespace reference
 {
 // Batch to Space
 template <typename T>
-SimpleTensor<T> batch_to_space(const SimpleTensor<T> &src, const SimpleTensor<int32_t> &block_shape, const TensorShape &dst_shape)
+SimpleTensor<T> batch_to_space(const SimpleTensor<T> &src, const std::vector<int32_t> &block_shape, const CropInfo &crop_info, const TensorShape &dst_shape)
 {
-    ARM_COMPUTE_ERROR_ON(block_shape[0] <= 0);
-    ARM_COMPUTE_ERROR_ON(block_shape[1] <= 0);
-    SimpleTensor<T> result(dst_shape, src.data_type());
+    ARM_COMPUTE_ERROR_ON(block_shape[0] < 1);
+    ARM_COMPUTE_ERROR_ON(block_shape[1] < 1);
+    const auto expected_dst_shape = misc::shape_calculator::compute_batch_to_space_shape(DataLayout::NCHW, src.shape(), block_shape[0], block_shape[1], crop_info);
+    ARM_COMPUTE_ERROR_ON(arm_compute::detail::have_different_dimensions(expected_dst_shape, dst_shape, 0));
+    ARM_COMPUTE_UNUSED(expected_dst_shape);
 
-    int        in_pos    = 0;
-    const auto width_in  = static_cast<int>(src.shape()[0]);
-    const auto height_in = static_cast<int>(src.shape()[1]);
-    const auto z_in      = static_cast<int>(src.shape()[2]);
-    const auto batch_in  = static_cast<int>(src.shape()[3]);
+    SimpleTensor<T> result(dst_shape, src.data_type());
+    int             out_pos    = 0;
+    const auto      width_out  = static_cast<int>(dst_shape[0]);
+    const auto      height_out = static_cast<int>(dst_shape[1]);
+    const auto      z_out      = static_cast<int>(dst_shape[2]);
+    const auto      batch_out  = static_cast<int>(dst_shape[3]);
 
-    for(int batch = 0; batch < batch_in; ++batch)
+    for(int batch = 0; batch < batch_out; ++batch)
     {
-        for(int z = 0; z < z_in; ++z)
+        for(int z = 0; z < z_out; ++z)
         {
-            for(int y = 0; y < height_in; ++y)
+            for(int y = 0; y < height_out; ++y)
             {
-                for(int x = 0; x < width_in; ++x)
+                for(int x = 0; x < width_out; ++x)
                 {
-                    const int r       = src.shape()[3] / (block_shape[0] * block_shape[1]);
-                    const int out_x   = (block_shape[0] * x + (batch / r) % block_shape[0]);
-                    const int out_y   = (block_shape[1] * y + (batch / r) / block_shape[0]);
-                    const int out_pos = out_x + dst_shape[0] * out_y + z * dst_shape[0] * dst_shape[1] + (batch % r) * dst_shape[0] * dst_shape[1] * dst_shape[2];
-                    result[out_pos]   = src[in_pos];
-                    ++in_pos;
+                    const int x_c      = x + crop_info.left;
+                    const int y_c      = y + crop_info.top;
+                    const int in_batch = batch + ((x_c % block_shape[0]) + (y_c % block_shape[1]) * (block_shape[0])) * dst_shape[3];
+                    const int in_x     = x_c / block_shape[0];
+                    const int in_y     = y_c / block_shape[1];
+                    const int in_pos   = in_x + src.shape()[0] * in_y + z * src.shape()[0] * src.shape()[1] + in_batch * src.shape()[0] * src.shape()[1] * src.shape()[2];
+                    result[out_pos]    = src[in_pos];
+                    ++out_pos;
                 }
             }
         }
@@ -68,8 +75,8 @@ SimpleTensor<T> batch_to_space(const SimpleTensor<T> &src, const SimpleTensor<in
 
     return result;
 }
-template SimpleTensor<float> batch_to_space(const SimpleTensor<float> &src, const SimpleTensor<int32_t> &block_shape, const TensorShape &dst_shape);
-template SimpleTensor<half> batch_to_space(const SimpleTensor<half> &src, const SimpleTensor<int32_t> &block_shape, const TensorShape &dst_shape);
+template SimpleTensor<float> batch_to_space(const SimpleTensor<float> &src, const std::vector<int32_t> &block_shape, const CropInfo &crop_info, const TensorShape &dst_shape);
+template SimpleTensor<half> batch_to_space(const SimpleTensor<half> &src, const std::vector<int32_t> &block_shape, const CropInfo &crop_info, const TensorShape &dst_shape);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/BatchToSpaceLayer.h b/tests/validation/reference/BatchToSpaceLayer.h
index 52556cb53f..a37bfc3373 100644
--- a/tests/validation/reference/BatchToSpaceLayer.h
+++ b/tests/validation/reference/BatchToSpaceLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2019, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_TEST_BATCH_TO_SPACE_LAYER_H
 #define ARM_COMPUTE_TEST_BATCH_TO_SPACE_LAYER_H
 
+#include "arm_compute/core/Types.h"
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
 
@@ -36,7 +37,7 @@ namespace validation
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> batch_to_space(const SimpleTensor<T> &src, const SimpleTensor<int32_t> &block_shape, const TensorShape &dst_shape);
+SimpleTensor<T> batch_to_space(const SimpleTensor<T> &src, const std::vector<int32_t> &block_shape, const CropInfo &crop_info, const TensorShape &dst_shape);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Conv3D.cpp b/tests/validation/reference/Conv3D.cpp
new file mode 100644
index 0000000000..e4010a507a
--- /dev/null
+++ b/tests/validation/reference/Conv3D.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Conv3D.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "support/AclRequires.h"
+#include "tests/validation/reference/UtilsQuantizedAsymm.h"
+
+// Source/Destination Tensor shape indices (N D H W C)
+constexpr unsigned int batch_dim   = 4u;
+constexpr unsigned int depth_dim   = 3u;
+constexpr unsigned int height_dim  = 2u;
+constexpr unsigned int width_dim   = 1u;
+constexpr unsigned int channel_dim = 0u;
+
+// Weight tensor shape indices (D H W Cin Cout)
+constexpr unsigned int weights_depth_dim  = 4u;
+constexpr unsigned int weights_height_dim = 3u;
+constexpr unsigned int weights_width_dim  = 2u;
+constexpr unsigned int weights_CHin_dim   = 1u;
+constexpr unsigned int weights_CHout_dim  = 0u;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+inline bool is_valid_pixel(int i, int min, int max) // True when i lies in the half-open range [min, max)
+{
+    return (i >= min && i < max);
+}
+
+// Floating-point overload: sums input * weight over the dilated kernel window anchored at (x_start, y_start, z_start) for one batch item and one output channel, skipping taps outside the source tensor, then returns the sum plus bias[ch_out]. oq_info is unused here.
+template < typename T, typename TB, typename std::enable_if < validation::is_floating_point<T>::value &&validation::is_floating_point<TB>::value, int >::type = 0 >
+T calculate_conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const Size3D &dilation, int batch,
+                   int z_start, int y_start, int x_start, int ch_out, UniformQuantizationInfo oq_info)
+{
+    ARM_COMPUTE_UNUSED(oq_info);
+
+    const unsigned int weights_width  = weights.shape()[weights_width_dim];
+    const unsigned int weights_height = weights.shape()[weights_height_dim];
+    const unsigned int weights_depth  = weights.shape()[weights_depth_dim];
+
+    const unsigned int src_channels = src.shape()[channel_dim];
+    const unsigned int src_width    = src.shape()[width_dim];
+    const unsigned int src_height   = src.shape()[height_dim];
+    const unsigned int src_depth    = src.shape()[depth_dim];
+
+    T total(0);
+    for(unsigned int weight_d = 0; weight_d < weights_depth; ++weight_d)
+    {
+        const int idx_z = z_start + dilation.depth * weight_d;
+        for(unsigned int weight_y = 0; weight_y < weights_height; ++weight_y)
+        {
+            const int idx_y = y_start + dilation.height * weight_y;
+            for(unsigned int weight_x = 0; weight_x < weights_width; ++weight_x)
+            {
+                const int idx_x = x_start + dilation.width * weight_x;
+
+                // Skip kernel taps whose source coordinate falls outside the source tensor bounds; they add nothing to the sum.
+                const bool is_x_valid       = is_valid_pixel(idx_x, 0, src_width);
+                const bool is_y_valid       = is_valid_pixel(idx_y, 0, src_height);
+                const bool is_z_valid       = is_valid_pixel(idx_z, 0, src_depth);
+                const bool is_invalid_pixel = !(is_x_valid && is_y_valid && is_z_valid);
+                if(is_invalid_pixel)
+                {
+                    continue;
+                }
+
+                for(unsigned int ch_in = 0; ch_in < src_channels; ++ch_in)
+                {
+                    const T *in_ptr = src.data();
+                    const T *w_ptr  = weights.data();
+
+                    const int in_offset     = coord2index(src.shape(), Coordinates{ ch_in, idx_x, idx_y, idx_z, batch });
+                    const int weight_offset = coord2index(weights.shape(), Coordinates{ ch_out, ch_in, weight_x, weight_y, weight_d });
+                    T         input_value   = in_ptr[in_offset];
+                    T         weight_value  = w_ptr[weight_offset];
+                    total += (input_value * weight_value);
+                }
+            }
+        }
+    }
+
+    const TB *b_ptr      = bias.data();
+    TB        bias_value = b_ptr[ch_out];
+
+    return total + bias_value;
+}
+
+template < typename T, typename TB, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) > // Quantized overload (T is uint8_t or int8_t): same window walk as the floating-point overload, accumulated in int32_t and passed through quantize_down_scale_by_fixedpoint.
+T calculate_conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const Size3D &dilation, int batch,
+                   int z_start, int y_start, int x_start, int ch_out, UniformQuantizationInfo oq_info)
+{
+    const unsigned int weights_width  = weights.shape()[weights_width_dim];
+    const unsigned int weights_height = weights.shape()[weights_height_dim];
+    const unsigned int weights_depth  = weights.shape()[weights_depth_dim];
+
+    const unsigned int src_channels = src.shape()[channel_dim];
+    const unsigned int src_width    = src.shape()[width_dim];
+    const unsigned int src_height   = src.shape()[height_dim];
+    const unsigned int src_depth    = src.shape()[depth_dim];
+
+    const UniformQuantizationInfo iq_info = src.quantization_info().uniform();
+    const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
+
+    const int   input_offset   = -iq_info.offset; // negated so it can simply be added to each raw input value below
+    const float input_scale    = iq_info.scale;
+    int         weights_offset = -wq_info.offset; // negated for the same reason as input_offset
+    float       weights_scale  = wq_info.scale;
+    const int   output_offset  = oq_info.offset;
+    const float output_scale   = oq_info.scale;
+
+    int         output_multiplier = 0;
+    int         output_shift      = 0;
+    const float multiplier        = input_scale * weights_scale / output_scale;
+    arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+
+    int32_t total(0); // accumulator is int32_t regardless of T
+    for(unsigned int weight_d = 0; weight_d < weights_depth; ++weight_d)
+    {
+        const int idx_z = z_start + dilation.depth * weight_d;
+        for(unsigned int weight_y = 0; weight_y < weights_height; ++weight_y)
+        {
+            const int idx_y = y_start + dilation.height * weight_y;
+            for(unsigned int weight_x = 0; weight_x < weights_width; ++weight_x)
+            {
+                const int idx_x = x_start + dilation.width * weight_x;
+
+                // Skip kernel taps whose source coordinate falls outside the source tensor bounds; they add nothing to the sum.
+                const bool is_x_valid       = is_valid_pixel(idx_x, 0, src_width);
+                const bool is_y_valid       = is_valid_pixel(idx_y, 0, src_height);
+                const bool is_z_valid       = is_valid_pixel(idx_z, 0, src_depth);
+                const bool is_invalid_pixel = !(is_x_valid && is_y_valid && is_z_valid);
+                if(is_invalid_pixel)
+                {
+                    continue;
+                }
+
+                for(unsigned int ch_in = 0; ch_in < src_channels; ++ch_in)
+                {
+                    const T *in_ptr = src.data();
+                    const T *w_ptr  = weights.data();
+
+                    const int in_offset     = coord2index(src.shape(), Coordinates{ ch_in, idx_x, idx_y, idx_z, batch });
+                    const int weight_offset = coord2index(weights.shape(), Coordinates{ ch_out, ch_in, weight_x, weight_y, weight_d });
+                    T         input_value   = in_ptr[in_offset];
+                    T         weight_value  = w_ptr[weight_offset];
+                    total += ((input_value + input_offset) * (weight_value + weights_offset));
+                }
+            }
+        }
+    }
+
+    const TB *b_ptr      = bias.data();
+    TB        bias_value = b_ptr[ch_out];
+
+    total += bias_value;
+
+    return validation::quantize_down_scale_by_fixedpoint(total, output_multiplier, output_shift, output_offset,
+                                                         std::numeric_limits<T>::lowest(), std::numeric_limits<T>::max());
+}
+} // namespace
+
+template <typename T, typename TB>
+SimpleTensor<T> conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst, const Conv3dInfo &conv3d_info)
+{
+    // Reference 3D convolution: writes every element of dst via calculate_conv3d and returns dst by value.
+    const unsigned int batch_size     = src.shape()[batch_dim];
+    const unsigned int dst_width      = dst.shape()[width_dim];
+    const unsigned int dst_height     = dst.shape()[height_dim];
+    const unsigned int dst_depth      = dst.shape()[depth_dim];
+    const unsigned int src_channels   = src.shape()[channel_dim];
+    const unsigned int weights_out_ch = weights.shape()[weights_CHout_dim];
+    const unsigned int dst_channels   = dst.shape()[channel_dim];
+    const size_t       pad_left       = conv3d_info.padding.left;
+    const size_t       pad_top        = conv3d_info.padding.top;
+    const size_t       pad_front      = conv3d_info.padding.front;
+    const size_t       stride_x       = conv3d_info.stride.x();
+    const size_t       stride_y       = conv3d_info.stride.y();
+    const size_t       stride_z       = conv3d_info.stride.z();
+
+    const TensorShape dst_shape = arm_compute::misc::shape_calculator::compute_conv3d_shape(src.shape(), weights.shape(), conv3d_info);
+
+    ARM_COMPUTE_UNUSED(src_channels, weights_out_ch, dst_channels, dst_shape, weights_CHin_dim);
+    // Number of batches of source and destination tensors must match.
+    ARM_COMPUTE_ERROR_ON(src.shape()[batch_dim] != dst.shape()[batch_dim]);
+    // Input channels in the source and weights must match.
+    ARM_COMPUTE_ERROR_ON(src_channels != weights.shape()[weights_CHin_dim]);
+    // Output channels of the weights and channels of the destination must match.
+    ARM_COMPUTE_ERROR_ON(weights_out_ch != dst_channels);
+    // Bias must match the number of destination channels.
+    ARM_COMPUTE_ERROR_ON(bias.shape()[0] != dst_channels);
+    // Compare given dst tensor shape with expected shape.
+    ARM_COMPUTE_ERROR_ON(dst.shape() != dst_shape);
+
+    for(unsigned int batch = 0; batch < batch_size; ++batch)
+    {
+        for(unsigned int z_out = 0; z_out < dst_depth; ++z_out)
+        {
+            const int z_start = (z_out * stride_z) - pad_front; // origin of the kernel window along z, in source coordinates
+            for(unsigned int y_out = 0; y_out < dst_height; ++y_out)
+            {
+                const int y_start = (y_out * stride_y) - pad_top; // origin of the kernel window along y, in source coordinates
+                for(unsigned int x_out = 0; x_out < dst_width; ++x_out)
+                {
+                    const int x_start = (x_out * stride_x) - pad_left; // origin of the kernel window along x, in source coordinates
+                    for(unsigned int ch_out = 0; ch_out < dst_channels; ++ch_out)
+                    {
+                        T *out_ptr = dst.data();
+
+                        const int out_offset = coord2index(dst.shape(), Coordinates{ ch_out, x_out, y_out, z_out, batch });
+                        out_ptr[out_offset]  = calculate_conv3d<T, TB>(src, weights, bias, conv3d_info.dilation, batch, z_start, y_start, x_start, ch_out, dst.quantization_info().uniform());
+                    }
+                }
+            }
+        }
+    }
+    return dst;
+}
+
+template SimpleTensor<float> conv3d(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, SimpleTensor<float> &dst,
+                                    const Conv3dInfo &conv3d_info);
+template SimpleTensor<half> conv3d(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &bias, SimpleTensor<half> &dst,
+                                   const Conv3dInfo &conv3d_info);
+template SimpleTensor<uint8_t> conv3d(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, SimpleTensor<uint8_t> &dst,
+                                      const Conv3dInfo &conv3d_info);
+template SimpleTensor<int8_t> conv3d(const SimpleTensor<int8_t> &src, const SimpleTensor<int8_t> &weights, const SimpleTensor<int32_t> &bias, SimpleTensor<int8_t> &dst,
+                                     const Conv3dInfo &conv3d_info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/Conv3D.h b/tests/validation/reference/Conv3D.h
new file mode 100644
index 0000000000..e3674f4bfb
--- /dev/null
+++ b/tests/validation/reference/Conv3D.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_CONV3D_LAYER_H
+#define ARM_COMPUTE_TEST_CONV3D_LAYER_H
+
+#include "Utils.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T, typename TB> // T: element type of src, weights and dst; TB: element type of bias
+SimpleTensor<T> conv3d(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst,
+                       const Conv3dInfo &conv3d_info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_CONV3D_LAYER_H */
diff --git a/tests/validation/reference/Convolution3d.h b/tests/validation/reference/Convolution3d.h
index 1666e3857b..b67e88e839 100644
--- a/tests/validation/reference/Convolution3d.h
+++ b/tests/validation/reference/Convolution3d.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,7 @@
 #define ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H
 
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "support/Requires.h"
+#include "support/AclRequires.h"
 #include "tests/validation/Helpers.h"
 #include "tests/validation/reference/UtilsQuantizedAsymm.h"
 
diff --git a/tests/validation/reference/DFT.cpp b/tests/validation/reference/DFT.cpp
index fd126c7d73..2b03c270ac 100644
--- a/tests/validation/reference/DFT.cpp
+++ b/tests/validation/reference/DFT.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -400,10 +400,10 @@ SimpleTensor<T> conv2d_dft(const SimpleTensor<T> &src, const SimpleTensor<T> &w,
     auto              padded_src = pad_layer(src, padding_in);
 
     // Flip weights
-    std::vector<uint32_t>  axis_v = { 0, 1 };
-    SimpleTensor<uint32_t> axis{ TensorShape(2U), DataType::U32 };
+    std::vector<uint32_t> axis_v = { 0, 1 };
+    SimpleTensor<int32_t> axis{ TensorShape(2U), DataType::S32 };
     std::copy(axis_v.begin(), axis_v.begin() + axis.shape().x(), axis.data());
-    auto flipped_w = reverse(w, axis);
+    auto flipped_w = reverse(w, axis, /* use_inverted_axis */ false);
 
     // Pad weights to have the same size as input
     const PaddingList paddings_w = { { 0, src.shape()[0] - 1 }, { 0, src.shape()[1] - 1 } };
diff --git a/tests/validation/reference/DepthConvertLayer.cpp b/tests/validation/reference/DepthConvertLayer.cpp
index 94c719ade7..3f88897f8e 100644
--- a/tests/validation/reference/DepthConvertLayer.cpp
+++ b/tests/validation/reference/DepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -165,7 +165,7 @@ template SimpleTensor<half> depth_convert(const SimpleTensor<int32_t> &src, Data
 template SimpleTensor<float> depth_convert(const SimpleTensor<int32_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 
 // BFLOAT16
-template SimpleTensor<float> depth_convert(const SimpleTensor<bfloat16> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<bfloat16> depth_convert(const SimpleTensor<bfloat16> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 
 // F16
 template SimpleTensor<uint8_t> depth_convert(const SimpleTensor<half> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
@@ -186,6 +186,25 @@ template SimpleTensor<int32_t> depth_convert(const SimpleTensor<float> &src, Dat
 template SimpleTensor<half> depth_convert(const SimpleTensor<float> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 template SimpleTensor<bfloat16> depth_convert(const SimpleTensor<float> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 
+// S64
+template SimpleTensor<uint8_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int8_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<uint16_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int16_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<uint32_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int32_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<half> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<float> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+
+// U64
+template SimpleTensor<uint8_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int8_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<uint16_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int16_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<uint32_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<int32_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<half> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<float> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/DequantizationLayer.cpp b/tests/validation/reference/DequantizationLayer.cpp
index 64a89aa6a0..67d69c2c38 100644
--- a/tests/validation/reference/DequantizationLayer.cpp
+++ b/tests/validation/reference/DequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,6 +59,12 @@ TOut dequantize(int16_t val, const UniformQuantizationInfo qinfo, DataType dt)
     ARM_COMPUTE_UNUSED(dt);
     return static_cast<TOut>(dequantize_qsymm16(val, qinfo));
 }
+template <typename TOut> // S32 overload: dequantizes a single int32_t value and casts the result to TOut
+TOut dequantize(int32_t val, const UniformQuantizationInfo qinfo, DataType dt)
+{
+    ARM_COMPUTE_UNUSED(dt); // dt is not needed here; it is kept so the signature lines up with the int16_t overload above
+    return static_cast<TOut>(dequantize_s32(val, qinfo)); // the actual conversion is delegated to dequantize_s32
+}
 } // namespace
 template <typename TOut, typename TIn>
 SimpleTensor<TOut> dequantization_layer(const SimpleTensor<TIn> &src)
@@ -115,6 +121,7 @@ template SimpleTensor<half> dequantization_layer(const SimpleTensor<int8_t> &src
 template SimpleTensor<float> dequantization_layer(const SimpleTensor<int8_t> &src);
 template SimpleTensor<half> dequantization_layer(const SimpleTensor<int16_t> &src);
 template SimpleTensor<float> dequantization_layer(const SimpleTensor<int16_t> &src);
+template SimpleTensor<float> dequantization_layer(const SimpleTensor<int32_t> &src);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ElementwiseOperations.cpp b/tests/validation/reference/ElementwiseOperations.cpp
index f22c84e153..edbbab8600 100644
--- a/tests/validation/reference/ElementwiseOperations.cpp
+++ b/tests/validation/reference/ElementwiseOperations.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,15 +74,6 @@ T arithm_op(ArithmeticOperation op, T src1, T src2, ConvertPolicy convert_policy
         case ArithmeticOperation::DIV:
         {
             val = (static_cast<intermediate_type>(src1) / static_cast<intermediate_type>(src2));
-            if(std::is_integral<T>::value)
-            {
-                // Implement flooring division
-                val = (src2 == 0) ? 0 : val;
-                if(static_cast<int32_t>(src1) % static_cast<int32_t>(src2) != 0 && ((src1 < 0) != (src2 < 0)))
-                {
-                    --val;
-                }
-            }
             break;
         }
         case ArithmeticOperation::POWER:
diff --git a/tests/validation/reference/ElementwiseUnary.cpp b/tests/validation/reference/ElementwiseUnary.cpp
index 5333b53c15..558f9d24fc 100644
--- a/tests/validation/reference/ElementwiseUnary.cpp
+++ b/tests/validation/reference/ElementwiseUnary.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,8 @@
  * SOFTWARE.
  */
 #include "ElementwiseUnary.h"
-
+#include "tests/validation/Helpers.h"
+#include "utils/TypePrinter.h"
 namespace arm_compute
 {
 namespace test
@@ -32,10 +33,8 @@ namespace validation
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> elementwise_unary(const SimpleTensor<T> &src, ElementWiseUnary op)
+SimpleTensor<T> elementwise_unary(const SimpleTensor<T> &src, SimpleTensor<T> &dst, ElementWiseUnary op)
 {
-    SimpleTensor<T> dst(src.shape(), src.data_type());
-
     for(int i = 0; i < src.num_elements(); ++i)
     {
         switch(op)
@@ -65,13 +64,107 @@ SimpleTensor<T> elementwise_unary(const SimpleTensor<T> &src, ElementWiseUnary o
                 ARM_COMPUTE_ERROR("Not implemented");
         }
     }
+    return dst;
+}
+template <> // int8_t specialization: convert src to F32, apply op in float, convert the result back into dst
+SimpleTensor<int8_t> elementwise_unary(const SimpleTensor<int8_t> &src, SimpleTensor<int8_t> &dst, ElementWiseUnary op)
+{
+    if(dst.data_type() == DataType::QASYMM8_SIGNED)
+    {
+        SimpleTensor<float> src_tmp = convert_from_asymmetric(src); // F32 copy of src (presumably dequantized with src's quantization info - confirm in Helpers.h)
+        SimpleTensor<float> dst_tmp(src.shape(), DataType::F32); // F32 scratch result with the shape of src
+        for(int i = 0; i < src.num_elements(); ++i)
+        {
+            switch(op)
+            {
+                case ElementWiseUnary::RSQRT:
+                    if(src_tmp[i] != 0)
+                    {
+                        dst_tmp[i] = 1.f / std::sqrt(src_tmp[i]);
+                    }
+                    else
+                    {
+                       // rsqrt(0) gives 'inf', so store the float value that corresponds to the int8 maximum (127) under dst's offset and scale
+                       dst_tmp[i] = (127.0f - dst.quantization_info().uniform().offset)  * dst.quantization_info().uniform().scale ;
+                    }
+                    break;
+
+                case ElementWiseUnary::LOG:
+                    if(src_tmp[i] != 0)
+                    {
+                        dst_tmp[i] = std::log(src_tmp[i]);
+                    }
+                    else
+                    {
+                       dst_tmp[i] = (-128.0f - dst.quantization_info().uniform().offset)  * dst.quantization_info().uniform().scale ; // log(0) is '-inf': use the float value that corresponds to the int8 minimum (-128)
+                    }
+                    break;
+
+                default:
+                    elementwise_unary(src_tmp, dst_tmp, op); // NOTE(review): whole-tensor call issued once per loop iteration; looks redundant after the first pass - confirm and consider hoisting
+                    break;
+            }
+        }
+        dst = convert_to_asymmetric<int8_t>(dst_tmp, dst.quantization_info()); // convert the F32 result back to int8 with dst's quantization info and overwrite dst
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not implemented");
+    }
+    return dst;
+}
+template <> // uint8_t specialization: convert src to F32, apply op in float, convert the result back into dst
+SimpleTensor<uint8_t> elementwise_unary(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, ElementWiseUnary op)
+{
+    if(dst.data_type() == DataType::QASYMM8)
+    {
+        SimpleTensor<float> src_tmp = convert_from_asymmetric(src); // F32 copy of src (presumably dequantized with src's quantization info - confirm in Helpers.h)
+        SimpleTensor<float> dst_tmp(src.shape(), DataType::F32); // F32 scratch result with the shape of src
+        for(int i = 0; i < src.num_elements(); ++i)
+        {
+            switch(op)
+            {
+                case ElementWiseUnary::RSQRT:
+                    if(src_tmp[i] != 0)
+                    {
+                        dst_tmp[i] = 1.f / std::sqrt(src_tmp[i]);
+                    }
+                    else
+                    {
+                        // rsqrt(0) gives 'inf', so store the float value that corresponds to the uint8 maximum (255) under dst's offset and scale
+                        dst_tmp[i] = (255.0f - dst.quantization_info().uniform().offset)* dst.quantization_info().uniform().scale;
+                    }
+                    break;
 
+                case ElementWiseUnary::LOG:
+                    if(src_tmp[i] != 0)
+                    {
+                        dst_tmp[i] = std::log(src_tmp[i]);
+                    }
+                    else
+                    {
+                        dst_tmp[i] = -dst.quantization_info().uniform().offset * dst.quantization_info().uniform().scale; // log(0) is '-inf': use the float value that corresponds to the uint8 minimum (0)
+                    }
+                    break;
+
+                default:
+                    elementwise_unary(src_tmp, dst_tmp, op); // NOTE(review): whole-tensor call issued once per loop iteration; looks redundant after the first pass - confirm and consider hoisting
+                    break;
+            }
+        }
+        dst = convert_to_asymmetric<uint8_t>(dst_tmp, dst.quantization_info()); // convert the F32 result back to uint8 with dst's quantization info and overwrite dst
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not implemented");
+    }
     return dst;
 }
 
-template SimpleTensor<float> elementwise_unary(const SimpleTensor<float> &src, ElementWiseUnary op);
-template SimpleTensor<half> elementwise_unary(const SimpleTensor<half> &src, ElementWiseUnary op);
-template SimpleTensor<int32_t> elementwise_unary(const SimpleTensor<int32_t> &src, ElementWiseUnary op);
+template SimpleTensor<float> elementwise_unary(const SimpleTensor<float> &src, SimpleTensor<float> &dst, ElementWiseUnary op);
+template SimpleTensor<half> elementwise_unary(const SimpleTensor<half> &src, SimpleTensor<half> &dst, ElementWiseUnary op);
+template SimpleTensor<int32_t> elementwise_unary(const SimpleTensor<int32_t> &src, SimpleTensor<int32_t> &dst, ElementWiseUnary op);
+
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ElementwiseUnary.h b/tests/validation/reference/ElementwiseUnary.h
index be4a229a5b..ae7a49bce4 100644
--- a/tests/validation/reference/ElementwiseUnary.h
+++ b/tests/validation/reference/ElementwiseUnary.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2019, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,7 +35,7 @@ namespace validation
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> elementwise_unary(const SimpleTensor<T> &src, ElementWiseUnary op);
+SimpleTensor<T> elementwise_unary(const SimpleTensor<T> &src, SimpleTensor<T> &dst, ElementWiseUnary op);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/FullyConnectedLayer.cpp b/tests/validation/reference/FullyConnectedLayer.cpp
index 21333958f8..af30e9ee54 100644
--- a/tests/validation/reference/FullyConnectedLayer.cpp
+++ b/tests/validation/reference/FullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -123,7 +123,7 @@ SimpleTensor<T> fully_connected_layer(const SimpleTensor<T> &src, const SimpleTe
     // Create reference
     SimpleTensor<T> dst{ TensorShape{ dst_shape }, src.data_type(), 1, out_quant_info };
 
-    // Sanity checks
+    // Health checks
     const int          num_batch_dimensions = std::max(0, static_cast<int>(dst_shape.num_dimensions()) - 1);
     const int          num_input_dimensions = src.shape().num_dimensions() - num_batch_dimensions;
     const unsigned int linear_input_size    = src.shape().total_size_lower(num_input_dimensions);
diff --git a/tests/validation/reference/GEMM.cpp b/tests/validation/reference/GEMM.cpp
index 6b3aa390f0..d513343796 100644
--- a/tests/validation/reference/GEMM.cpp
+++ b/tests/validation/reference/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
+#include "tests/validation/reference/ArithmeticOperations.h"
 
 namespace arm_compute
 {
@@ -35,10 +36,11 @@ namespace validation
 namespace reference
 {
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta)
+SimpleTensor<T>
+gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta)
 {
     // Create reference
-    SimpleTensor<T> dst{ c.shape(), c.data_type(), 1 };
+    SimpleTensor<T> dst{c.shape(), c.data_type(), 1};
 
     // Compute reference
     const int M = a.shape().y();
@@ -50,30 +52,47 @@ SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const S
     const int a_stride_z = K * M;
     const int a_stride_w = K * M * D;
 
-    const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;     // Do not slide the matrix B along the 3th dimension in case matrix B has less than 3 dimensions
-    const int b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
+    const int b_stride_z =
+        b.shape().num_dimensions() > 2
+        ? N * K
+        : 0; // Do not slide the matrix B along the 3rd dimension in case matrix B has less than 3 dimensions
+    int b_stride_w =
+        b.shape().num_dimensions() > 3
+        ? K * N * D
+        : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
+
+    // Note: There are 3 gemm types: batched-gemm, multi-gemm, and batched of multi-gemms. The third dimension of tensor b is overloaded when tensor b has exactly 3 dimensions:
+    // it can be either number of batches or multis. Batched-GEMM computation is detected only when the third dimension of "a" and "c" tensors is 1 and the number of dimensions is 4
+    const bool is_batched_gemm = b.shape().num_dimensions() == 3 && a.shape().num_dimensions() == 4 &&
+                                 c.shape().num_dimensions() == 4 && a.shape()[2] == 1 && c.shape()[2] == 1;
+
+    // Batched-GEMM
+    if (is_batched_gemm)
+    {
+        b_stride_w = b_stride_z;
+    }
 
     const int c_stride_z = N * M;
     const int c_stride_w = N * M * D;
 
-#if defined(_OPENMP) && !( defined(__arm__) && defined(__ANDROID__))
+#if defined(_OPENMP) && !(defined(__arm__) && defined(__ANDROID__))
     #pragma omp parallel for collapse(2)
 #endif /* _OPENMP */
-    for(int w = 0; w < W; ++w)
+    for (int w = 0; w < W; ++w)
     {
-        for(int depth = 0; depth < D; ++depth)
+        for (int depth = 0; depth < D; ++depth)
         {
             const int base_addr_a = depth * a_stride_z + w * a_stride_w;
             const int base_addr_b = depth * b_stride_z + w * b_stride_w;
             const int base_addr_c = depth * c_stride_z + w * c_stride_w;
 
-            for(int row = 0; row < M; ++row)
+            for (int row = 0; row < M; ++row)
             {
-                for(int col = 0; col < N; ++col)
+                for (int col = 0; col < N; ++col)
                 {
                     T acc(0);
 
-                    for(int k = 0; k < K; ++k)
+                    for (int k = 0; k < K; ++k)
                     {
                         acc += a[base_addr_a + k + row * K] * b[base_addr_b + col + k * N];
                     }
@@ -89,11 +108,12 @@ SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const S
 }
 
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> gemm_mixed_precision(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta)
+SimpleTensor<T> gemm_mixed_precision(
+    const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta)
 {
     // GEMM mixed-precision combines F32 accumulators with F16 multiplications
     // Create reference
-    SimpleTensor<T> dst{ c.shape(), c.data_type(), 1 };
+    SimpleTensor<T> dst{c.shape(), c.data_type(), 1};
 
     // Compute reference
     const int M = a.shape().y();
@@ -105,36 +125,54 @@ SimpleTensor<T> gemm_mixed_precision(const SimpleTensor<T> &a, const SimpleTenso
     const int a_stride_z = K * M;
     const int a_stride_w = K * M * D;
 
-    const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;     // Do not slide the matrix B along the 3th dimension in case matrix B has less than 3 dimensions
-    const int b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
+    const int b_stride_z =
+        b.shape().num_dimensions() > 2
+        ? N * K
+        : 0; // Do not slide the matrix B along the 3rd dimension in case matrix B has less than 3 dimensions
+    int b_stride_w =
+        b.shape().num_dimensions() > 3
+        ? K * N * D
+        : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
+
+    // Note: There are 3 gemm types: batched-gemm, multi-gemm, and batched of multi-gemms. The third dimension of tensor b is overloaded when tensor b has exactly 3 dimensions:
+    // it can be either number of batches or multis. Batched-GEMM computation is detected only when the third dimension of "a" and "c" tensors is 1 and the number of dimensions is 4
+    const bool is_batched_gemm = b.shape().num_dimensions() == 3 && a.shape().num_dimensions() == 4 &&
+                                 c.shape().num_dimensions() == 4 && a.shape()[2] == 1 && c.shape()[2] == 1;
+
+    // Batched-GEMM
+    if (is_batched_gemm)
+    {
+        b_stride_w = b_stride_z;
+    }
 
     const int c_stride_z = N * M;
     const int c_stride_w = N * M * D;
 
-#if defined(_OPENMP) && !( defined(__arm__) && defined(__ANDROID__))
+#if defined(_OPENMP) && !(defined(__arm__) && defined(__ANDROID__))
     #pragma omp parallel for collapse(2)
 #endif /* _OPENMP */
-    for(int w = 0; w < W; ++w)
+    for (int w = 0; w < W; ++w)
     {
-        for(int depth = 0; depth < D; ++depth)
+        for (int depth = 0; depth < D; ++depth)
         {
             const int base_addr_a = depth * a_stride_z + w * a_stride_w;
             const int base_addr_b = depth * b_stride_z + w * b_stride_w;
             const int base_addr_c = depth * c_stride_z + w * c_stride_w;
 
-            for(int row = 0; row < M; ++row)
+            for (int row = 0; row < M; ++row)
             {
-                for(int col = 0; col < N; ++col)
+                for (int col = 0; col < N; ++col)
                 {
                     float acc(0);
 
-                    for(int k = 0; k < K; ++k)
+                    for (int k = 0; k < K; ++k)
                     {
                         acc += static_cast<float>(a[base_addr_a + k + row * K] * b[base_addr_b + col + k * N]);
                     }
 
                     // Finalize the result: alpha * A * B + beta * C
-                    dst[base_addr_c + col + row * N] = static_cast<T>(alpha * acc + beta * c[base_addr_c + col + row * N]);
+                    dst[base_addr_c + col + row * N] =
+                        static_cast<T>(alpha * acc + beta * c[base_addr_c + col + row * N]);
                 }
             }
         }
@@ -143,8 +181,21 @@ SimpleTensor<T> gemm_mixed_precision(const SimpleTensor<T> &a, const SimpleTenso
     return dst;
 }
 
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
+void gemm_accumulate(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta, SimpleTensor<T> &dst)
+{
+    // Compute reference: run the plain gemm() reference into a temporary, then ADD that temporary to dst with the SATURATE policy.
+    SimpleTensor<T> dst_gemm = gemm(a, b, c, alpha, beta);
+    reference::arithmetic_operation<T>(reference::ArithmeticOperation::ADD, dst, dst_gemm, dst, ConvertPolicy::SATURATE); // dst is passed twice, presumably as first operand and as destination (dst += dst_gemm) - confirm against ArithmeticOperations.h
+}
+
+template SimpleTensor<bfloat16> gemm(const SimpleTensor<bfloat16> &a, const SimpleTensor<bfloat16> &b, const SimpleTensor<bfloat16> &c, float alpha, float beta);
 template SimpleTensor<float> gemm(const SimpleTensor<float> &a, const SimpleTensor<float> &b, const SimpleTensor<float> &c, float alpha, float beta);
 template SimpleTensor<half> gemm(const SimpleTensor<half> &a, const SimpleTensor<half> &b, const SimpleTensor<half> &c, float alpha, float beta);
+
+template void gemm_accumulate(const SimpleTensor<float> &a, const SimpleTensor<float> &b, const SimpleTensor<float> &c, float alpha, float beta, SimpleTensor<float> &dst);
+template void gemm_accumulate(const SimpleTensor<half> &a, const SimpleTensor<half> &b, const SimpleTensor<half> &c, float alpha, float beta, SimpleTensor<half> &dst);
+
 template SimpleTensor<half> gemm_mixed_precision(const SimpleTensor<half> &a, const SimpleTensor<half> &b, const SimpleTensor<half> &c, float alpha, float beta);
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/GEMM.h b/tests/validation/reference/GEMM.h
index 5feaeda584..1b97570122 100644
--- a/tests/validation/reference/GEMM.h
+++ b/tests/validation/reference/GEMM.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_GEMM_H
-#define ARM_COMPUTE_TEST_GEMM_H
+#ifndef ACL_TESTS_VALIDATION_REFERENCE_GEMM_H
+#define ACL_TESTS_VALIDATION_REFERENCE_GEMM_H
 
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
@@ -41,8 +41,11 @@ SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const S
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
 SimpleTensor<T> gemm_mixed_precision(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta);
 
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
+void gemm_accumulate(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta, SimpleTensor<T> &dst);
+
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GEMM_H */
+#endif // ACL_TESTS_VALIDATION_REFERENCE_GEMM_H
diff --git a/tests/validation/reference/GEMMLowp.cpp b/tests/validation/reference/GEMMLowp.cpp
index 1615b51e73..30c577d850 100644
--- a/tests/validation/reference/GEMMLowp.cpp
+++ b/tests/validation/reference/GEMMLowp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "GEMMLowp.h"
 
 #include "arm_compute/core/Types.h"
+#include "tests/validation/reference/ArithmeticOperations.h"
 #include "tests/validation/reference/UtilsQuantizedAsymm.h"
 
 #include "support/ToolchainSupport.h"
@@ -230,6 +231,13 @@ SimpleTensor<T_out> gemmlowp_matrix_multiply_core(const SimpleTensor<T_in> &a, c
     return c;
 }
 
+template <typename T_out, typename T_in, typename T_in_1> // accumulate variant of gemmlowp_matrix_multiply_core: the product is added to dst
+void gemmlowp_matrix_multiply_core_accumulate(const SimpleTensor<T_in> &a, const SimpleTensor<T_in_1> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset, SimpleTensor<T_out> &dst)
+{
+    SimpleTensor<T_out> dst_gemm = gemmlowp_matrix_multiply_core<T_out, T_in, T_in_1>(a, b, shape_c, a_offset, b_offset); // non-accumulating reference result held in a temporary
+    reference::arithmetic_operation<T_out>(reference::ArithmeticOperation::ADD, dst, dst_gemm, dst, ConvertPolicy::SATURATE); // dst is passed twice, presumably as first operand and as destination (dst += dst_gemm) - confirm against ArithmeticOperations.h
+}
+
 // used to validate assembly kernels which don't know anything about offsets
 template <typename T1, typename T2, typename T3>
 SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c)
@@ -336,6 +344,8 @@ template SimpleTensor<int8_t> gemmlowp_quantize_down_scale(const SimpleTensor<in
                                                            std::vector<int32_t> result_shift, int32_t min, int32_t max);
 template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
 template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
+template void gemmlowp_matrix_multiply_core_accumulate(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset, SimpleTensor<int32_t> &dst);
+template void gemmlowp_matrix_multiply_core_accumulate(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset, SimpleTensor<int32_t> &dst);
 template SimpleTensor<int32_t> gemmlowp<int32_t, int8_t, int8_t>(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c);
 template SimpleTensor<int32_t> gemmlowp<int32_t, uint8_t, uint8_t>(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c);
 template SimpleTensor<int32_t> gemmlowp<int32_t, uint8_t, int8_t>(const SimpleTensor<uint8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c);
diff --git a/tests/validation/reference/GEMMLowp.h b/tests/validation/reference/GEMMLowp.h
index 99015d71fb..6e471fdad1 100644
--- a/tests/validation/reference/GEMMLowp.h
+++ b/tests/validation/reference/GEMMLowp.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_GEMMLOWP_H
-#define ARM_COMPUTE_TEST_GEMMLOWP_H
+#ifndef ACL_TESTS_VALIDATION_REFERENCE_GEMMLOWP_H
+#define ACL_TESTS_VALIDATION_REFERENCE_GEMMLOWP_H
 
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
@@ -38,6 +38,9 @@ namespace reference
 template <typename T1, typename T2, typename T3>
 SimpleTensor<T1> gemmlowp_matrix_multiply_core(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
 
+template <typename T1, typename T2, typename T3>
+void gemmlowp_matrix_multiply_core_accumulate(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset, SimpleTensor<T1> &dst_);
+
 template <typename T1, typename T2, typename T3 = T2>
 SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c);
 
@@ -71,4 +74,4 @@ SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_float(const SimpleTensor<TIn>
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GEMMLOWP_H */
+#endif // ACL_TESTS_VALIDATION_REFERENCE_GEMMLOWP_H
diff --git a/tests/validation/reference/Gather.cpp b/tests/validation/reference/Gather.cpp
index 93ac09cf95..c90c04f8cc 100644
--- a/tests/validation/reference/Gather.cpp
+++ b/tests/validation/reference/Gather.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2019, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,27 +39,56 @@ namespace reference
 template <typename T>
 SimpleTensor<T> gather(const SimpleTensor<T> &src, const SimpleTensor<uint32_t> &indices, uint32_t actual_axis)
 {
-    const auto       *indices_ptr = static_cast<const uint32_t *>(indices.data());
     const TensorShape dst_shape   = arm_compute::misc::shape_calculator::compute_gather_shape(src.shape(), indices.shape(), actual_axis);
     SimpleTensor<T>   dst(dst_shape, src.data_type());
 
+    const auto        src_ptr     = static_cast<const T *>(src.data());
+    const auto        indices_ptr = static_cast<const uint32_t *>(indices.data());
+    const auto        dst_ptr     = static_cast<T *>(dst.data());
+
+    const uint32_t index_limit = src.shape()[actual_axis]; // extent of src along the gathered axis; an index >= this value is treated as out of range below
+
     Window win;
     win.use_tensor_dimensions(dst_shape);
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        Coordinates offset;
-        for(unsigned int dim = 0; dim < id.num_dimensions(); ++dim)
+
+    execute_window_loop(win, [&](const Coordinates &dst_coords) {
+        const auto dst_addr = coords2index(dst.shape(), dst_coords); // linear offset of the output element being produced
+
+        // Calculate the coordinates of the index value: dimension i of indices is read from dst dimension (i + actual_axis).
+        Coordinates idx_coords;
+
+        for(size_t i = 0; i < indices.shape().num_dimensions(); ++i)
         {
-            if(dim == actual_axis)
+            idx_coords.set(i, dst_coords[i + actual_axis]);
+        }
+
+        const auto index = indices_ptr[coords2index(indices.shape(), idx_coords)];
+
+        if(index < index_limit)
+        {
+            // Calculate the coordinates of the source data: dimensions below the axis come straight from dst, the axis takes the looked-up index,
+            Coordinates src_coords;
+
+            for(size_t i = 0; i < actual_axis; ++i)
             {
-                offset.set(dim, indices_ptr[id[dim]]);
+                src_coords.set(i, dst_coords[i]);
             }
-            else
+
+            src_coords.set(actual_axis, index);
+
+            for(size_t i = actual_axis + 1; i < src.shape().num_dimensions(); ++i) // and dimensions above the axis are read from dst shifted by (indices rank - 1)
             {
-                offset.set(dim, id[dim]);
+                src_coords.set(i, dst_coords[i + indices.shape().num_dimensions() - 1]);
             }
+
+            // Copy the data.
+            const auto src_addr = coords2index(src.shape(), src_coords);
+            dst_ptr[dst_addr] = src_ptr[src_addr];
+        }
+        else
+        {
+            dst_ptr[dst_addr] = 0; // out-of-range index: write zero instead of reading from src
         }
-        *reinterpret_cast<T *>(dst(id)) = *reinterpret_cast<const T *>(src(offset));
     });
 
     return dst;
@@ -72,4 +101,4 @@ template SimpleTensor<uint8_t> gather(const SimpleTensor<uint8_t> &src, const Si
 } // namespace reference
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
-\ No newline at end of file
+} // namespace arm_compute
diff --git a/tests/validation/reference/IndirectConv2dAddressPrecalculation.cpp b/tests/validation/reference/IndirectConv2dAddressPrecalculation.cpp
new file mode 100644
index 0000000000..7500560c91
--- /dev/null
+++ b/tests/validation/reference/IndirectConv2dAddressPrecalculation.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "IndirectConv2dAddressPrecalculation.h"
+
+#include "arm_compute/core/Types.h"
+
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+SimpleTensor<int32_t> indirect_conv2d_addr_precalculation(const TensorShape &shape_conv_src, const TensorShape &shape_conv_wei, const TensorShape &shape_conv_dst, const TensorShape &shape_dst,
+                                                          const PadStrideInfo &conv_info)
+{
+    // Reference address table for indirect convolution: each S32 entry is the flattened position
+    // (x + y * W + batch * W * H) of the source point read by one (output point, kernel tap) pair,
+    // or -1 when that tap falls outside the source plane.
+    constexpr unsigned int dim_w = 1; // width index of the convolution shapes
+    constexpr unsigned int dim_h = 2; // height index of the convolution shapes
+
+    // Convolution geometry.
+    const int in_w     = static_cast<int32_t>(shape_conv_src[dim_w]);
+    const int in_h     = static_cast<int32_t>(shape_conv_src[dim_h]);
+    const int out_w    = static_cast<int32_t>(shape_conv_dst[dim_w]);
+    const int kernel_w = static_cast<int32_t>(shape_conv_wei[dim_w]);
+    const int kernel_h = static_cast<int32_t>(shape_conv_wei[dim_h]);
+    const int taps     = kernel_w * kernel_h;
+    const int step_x   = static_cast<int32_t>(conv_info.stride().first);
+    const int step_y   = static_cast<int32_t>(conv_info.stride().second);
+    const int off_x    = static_cast<int32_t>(conv_info.pad_left());
+    const int off_y    = static_cast<int32_t>(conv_info.pad_top());
+
+    // Extents of the address table itself.
+    const int table_w = static_cast<int32_t>(shape_dst[0]);
+    const int table_h = static_cast<int32_t>(shape_dst[1]);
+    const int batches = static_cast<int32_t>(shape_dst[2]);
+
+    // Output points covered by one table row; entries are grouped by kernel tap within a row.
+    const int points_per_row = table_w / taps;
+
+    SimpleTensor<int32_t> out{ shape_dst, DataType::S32 };
+    for(int batch = 0; batch < batches; ++batch)
+    {
+        for(int row = 0; row < table_h; ++row)
+        {
+            const int first_point = row * points_per_row;
+            for(int tap = 0; tap < taps; ++tap)
+            {
+                const int tap_x = tap % kernel_w;
+                const int tap_y = tap / kernel_w;
+                for(int p = 0; p < points_per_row; ++p)
+                {
+                    // Source position of the output point (stride applied, padding removed), shifted by the kernel tap.
+                    const int point = first_point + p;
+                    const int src_x = (point % out_w) * step_x - off_x + tap_x;
+                    const int src_y = (point / out_w) * step_y - off_y + tap_y;
+                    const bool inside = (src_x >= 0) && (src_x < in_w) && (src_y >= 0) && (src_y < in_h);
+                    const int  linear = src_x + src_y * in_w + batch * in_w * in_h;
+
+                    const unsigned int slot = p + tap * points_per_row + row * (table_w) + batch * (table_w * table_h);
+                    out[slot]               = inside ? linear : -1;
+                }
+            }
+        }
+    }
+
+    return out;
+}
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+\ No newline at end of file
diff --git a/tests/validation/reference/IndirectConv2dAddressPrecalculation.h b/tests/validation/reference/IndirectConv2dAddressPrecalculation.h
new file mode 100644
index 0000000000..f4a90dfd9f
--- /dev/null
+++ b/tests/validation/reference/IndirectConv2dAddressPrecalculation.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_INDIRECT_CONV2D_ADDRESS_PRECALCULATION_H
+#define ARM_COMPUTE_TEST_INDIRECT_CONV2D_ADDRESS_PRECALCULATION_H
+
+#include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+SimpleTensor<int32_t> indirect_conv2d_addr_precalculation(const TensorShape &shape_conv_src, const TensorShape &shape_conv_wei, const TensorShape &shape_conv_out, const TensorShape &shape_out,
+                                                          const PadStrideInfo &conv_info); // Reference S32 address table for indirect conv2d. NOTE: the definition names shape_conv_out/shape_out as shape_conv_dst/shape_dst.
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_INDIRECT_CONV2D_ADDRESS_PRECALCULATION_H */
+\ No newline at end of file
diff --git a/tests/validation/reference/MeanStdDevNormalizationLayer.cpp b/tests/validation/reference/MeanStdDevNormalizationLayer.cpp
index 0a23fa19bb..a7c8a784d9 100644
--- a/tests/validation/reference/MeanStdDevNormalizationLayer.cpp
+++ b/tests/validation/reference/MeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,6 +63,15 @@ SimpleTensor<T> mean_std_normalization_layer(const SimpleTensor<T> &src, float e
     return dst;
 }
 
+template <>
+SimpleTensor<uint8_t> mean_std_normalization_layer(const SimpleTensor<uint8_t> &src, float epsilon)
+{
+    // Quantized path: dequantize, run the float reference, then requantize with the input's quantization info.
+    SimpleTensor<float> dequantized = convert_from_asymmetric(src);
+    SimpleTensor<float> normalized  = mean_std_normalization_layer<float>(dequantized, epsilon);
+    return convert_to_asymmetric<uint8_t>(normalized, src.quantization_info());
+}
+
 template SimpleTensor<float> mean_std_normalization_layer(const SimpleTensor<float> &src, float epsilon);
 template SimpleTensor<half> mean_std_normalization_layer(const SimpleTensor<half> &src, float epsilon);
 } // namespace reference
diff --git a/tests/validation/reference/Permute.cpp b/tests/validation/reference/Permute.cpp
index 6f122b1bf5..7aa3011d8f 100644
--- a/tests/validation/reference/Permute.cpp
+++ b/tests/validation/reference/Permute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019,2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "Permute.h"
 
 #include "arm_compute/core/Types.h"
+
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
@@ -42,11 +43,11 @@ SimpleTensor<T> permute(const SimpleTensor<T> &src, PermutationVector perm)
     permute(dst_shape, perm);
 
     // Create reference
-    SimpleTensor<T> dst{ dst_shape, src.data_type(), src.num_channels(), src.quantization_info() };
+    SimpleTensor<T> dst{dst_shape, src.data_type(), src.num_channels(), src.quantization_info()};
 
     // Compute reference
     const uint32_t num_elements = src.num_elements();
-    for(uint32_t i = 0; i < num_elements; ++i)
+    for (uint32_t i = 0; i < num_elements; ++i)
     {
         const Coordinates src_coords = index2coord(src.shape(), i);
         Coordinates       dst_coords = src_coords;
@@ -58,13 +59,14 @@ SimpleTensor<T> permute(const SimpleTensor<T> &src, PermutationVector perm)
     return dst;
 }
 
-template SimpleTensor<int8_t> permute(const SimpleTensor<int8_t> &src, PermutationVector perm);
-template SimpleTensor<uint8_t> permute(const SimpleTensor<uint8_t> &src, PermutationVector perm);
-template SimpleTensor<int16_t> permute(const SimpleTensor<int16_t> &src, PermutationVector perm);
+template SimpleTensor<int8_t>   permute(const SimpleTensor<int8_t> &src, PermutationVector perm);
+template SimpleTensor<uint8_t>  permute(const SimpleTensor<uint8_t> &src, PermutationVector perm);
+template SimpleTensor<int16_t>  permute(const SimpleTensor<int16_t> &src, PermutationVector perm);
 template SimpleTensor<uint16_t> permute(const SimpleTensor<uint16_t> &src, PermutationVector perm);
 template SimpleTensor<uint32_t> permute(const SimpleTensor<uint32_t> &src, PermutationVector perm);
-template SimpleTensor<float> permute(const SimpleTensor<float> &src, PermutationVector perm);
-template SimpleTensor<half> permute(const SimpleTensor<half> &src, PermutationVector perm);
+template SimpleTensor<float>    permute(const SimpleTensor<float> &src, PermutationVector perm);
+template SimpleTensor<half>     permute(const SimpleTensor<half> &src, PermutationVector perm);
+template SimpleTensor<bfloat16> permute(const SimpleTensor<bfloat16> &src, PermutationVector perm);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Pooling3dLayer.cpp b/tests/validation/reference/Pooling3dLayer.cpp
new file mode 100644
index 0000000000..2e8f3a0b92
--- /dev/null
+++ b/tests/validation/reference/Pooling3dLayer.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "Pooling3dLayer.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+using namespace arm_compute::misc::shape_calculator;
+/** Reference 3D pooling (MAX, AVG or L2) over width, height and depth; when @p indices is non-null it also receives, per output element, the index of the window maximum. */
+template <typename T>
+SimpleTensor<T> pooling_3d_layer_internal(const SimpleTensor<T> &src, const Pooling3dLayerInfo &pool3d_info, SimpleTensor<uint32_t> *indices)
+{
+    TensorShape     pooled_shape = compute_pool3d_shape(src.shape(), pool3d_info);
+    SimpleTensor<T> dst{ pooled_shape, src.data_type(), 1 };
+    // When requested, (re)create the indices tensor with the same shape as the output.
+    if(indices != nullptr)
+    {
+        *indices = SimpleTensor<uint32_t> { pooled_shape, DataType::U32, 1 };
+    }
+    // Dimension positions inside the tensor shapes: channel is innermost (0), batch is outermost (4).
+    const int idx_channel = 0;
+    const int idx_width   = 1;
+    const int idx_height  = 2;
+    const int idx_depth   = 3;
+    const int idx_batch   = 4;
+    // Global pooling uses the full source extent as the window size along each pooled axis.
+    const int pool_size_width  = pool3d_info.is_global_pooling ? src.shape()[idx_width] : pool3d_info.pool_size.width;
+    const int pool_size_height = pool3d_info.is_global_pooling ? src.shape()[idx_height] : pool3d_info.pool_size.height;
+    const int pool_size_depth  = pool3d_info.is_global_pooling ? src.shape()[idx_depth] : pool3d_info.pool_size.depth;
+
+    const int pool_stride_width  = static_cast<int>(pool3d_info.stride.width);
+    const int pool_stride_height = static_cast<int>(pool3d_info.stride.height);
+    const int pool_stride_depth  = static_cast<int>(pool3d_info.stride.depth);
+    // Padding before (left/top/front) and after (right/bottom/back) the source along each pooled axis.
+    const int pad_left  = static_cast<int>(pool3d_info.padding.left);
+    const int pad_top   = static_cast<int>(pool3d_info.padding.top);
+    const int pad_front = static_cast<int>(pool3d_info.padding.front);
+
+    const int pad_right  = static_cast<int>(pool3d_info.padding.right);
+    const int pad_bottom = static_cast<int>(pool3d_info.padding.bottom);
+    const int pad_back   = static_cast<int>(pool3d_info.padding.back);
+
+    const int num_channels = static_cast<int>(src.shape()[idx_channel]);
+    const int num_batches  = static_cast<int>(src.shape()[idx_batch]);
+    // Pooling must leave the channel and batch extents unchanged.
+    ARM_COMPUTE_ERROR_ON(num_channels != static_cast<int>(dst.shape()[idx_channel]));
+    ARM_COMPUTE_ERROR_ON(num_batches != static_cast<int>(dst.shape()[idx_batch]));
+
+    const int w_src = static_cast<int>(src.shape()[idx_width]);
+    const int h_src = static_cast<int>(src.shape()[idx_height]);
+    const int d_src = static_cast<int>(src.shape()[idx_depth]);
+    const int w_dst = static_cast<int>(dst.shape()[idx_width]);
+    const int h_dst = static_cast<int>(dst.shape()[idx_height]);
+    const int d_dst = static_cast<int>(dst.shape()[idx_depth]);
+
+    const bool exclude_padding = pool3d_info.exclude_padding;
+    // Element strides of the flattened src/dst buffers (one step along width is num_channels elements).
+    const int height_stride_src = num_channels * w_src;
+    const int depth_stride_src  = height_stride_src * h_src;
+    const int batch_stride_src  = depth_stride_src * d_src;
+    const int height_stride_dst = num_channels * w_dst;
+    const int depth_stride_dst  = height_stride_dst * h_dst;
+    const int batch_stride_dst  = depth_stride_dst * d_dst;
+
+    for(int b = 0; b < num_batches; ++b)
+    {
+        const int batch_offset_dst = b * batch_stride_dst;
+        const int batch_offset_src = b * batch_stride_src;
+        for(int c = 0; c < num_channels; ++c)
+        {
+            for(int d = 0; d < d_dst; ++d)
+            {
+                const int depth_offset_dst = d * depth_stride_dst;
+                for(int h = 0; h < h_dst; ++h)
+                {
+                    const int height_offset_dst = h * height_stride_dst;
+                    for(int w = 0; w < w_dst; ++w)
+                    {
+                        int wstart = w * pool_stride_width - pad_left;
+                        int hstart = h * pool_stride_height - pad_top;
+                        int dstart = d * pool_stride_depth - pad_front;
+                        int wend   = std::min(wstart + pool_size_width, w_src + pad_right);
+                        int hend   = std::min(hstart + pool_size_height, h_src + pad_bottom);
+                        int dend   = std::min(dstart + pool_size_depth, d_src + pad_back);
+
+                        // Window volume including padding. It can be smaller than pool_w * pool_h * pool_d
+                        // because the window end is capped at the padded source extent (e.g. with CEIL rounding).
+                        int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+
+                        // Clamp each [start, end) range to the valid source extent on its own axis.
+                        wstart = std::max(wstart, 0);
+                        hstart = std::max(hstart, 0);
+                        dstart = std::max(dstart, 0);
+                        wend   = std::min(wend, w_src);
+                        hend   = std::min(hend, h_src);
+                        dend   = std::min(dend, d_src);
+
+                        auto max_val = -std::numeric_limits<T>::infinity();
+                        int  max_index{ 0 };
+                        T    avg_val = static_cast<T>(0.f);
+                        T    l2_val  = static_cast<T>(0.f);
+                        // With exclude_padding the divisor counts only the in-bounds elements of the window.
+                        if(exclude_padding)
+                        {
+                            pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+                        }
+
+                        for(int z = dstart; z < dend; ++z)
+                        {
+                            const int depth_offset_src = z * depth_stride_src;
+                            for(int y = hstart; y < hend; ++y)
+                            {
+                                const int height_offset_src = y * height_stride_src;
+                                for(int x = wstart; x < wend; ++x)
+                                {
+                                    const auto val = static_cast<T>(
+                                                         src[batch_offset_src + depth_offset_src + height_offset_src + x * num_channels + c]);
+                                    if(val > max_val)
+                                    {
+                                        max_val   = val;
+                                        max_index = coord2index(src.shape(), Coordinates(c, x, y, z, 0)); // NOTE(review): batch coordinate is fixed to 0 here - confirm this is intended
+                                    }
+                                    // Sum and sum of squares are accumulated in T for every pooling type.
+                                    avg_val += val;
+                                    l2_val += val * val;
+                                }
+                            }
+                        }
+                        // NOTE(review): pool_size is assumed to be non-zero here - confirm against callers.
+                        avg_val /= pool_size;
+                        l2_val = static_cast<T>(std::sqrt(l2_val / pool_size));
+
+                        int dst_index = batch_offset_dst + depth_offset_dst + height_offset_dst + w * num_channels + c;
+                        switch(pool3d_info.pool_type)
+                        {
+                            case PoolingType::MAX:
+                                dst[dst_index] = static_cast<T>(max_val);
+                                break;
+                            case PoolingType::AVG:
+                                dst[dst_index] = static_cast<T>(avg_val);
+                                break;
+                            case PoolingType::L2:
+                                dst[dst_index] = static_cast<T>(l2_val);
+                                break;
+                            default:
+                                ARM_COMPUTE_ERROR("Pooling Type should be either MAX, AVG or L2");
+                        }
+                        // The max position is recorded for every pooling type, not only for MAX.
+                        if(indices != nullptr)
+                        {
+                            (*indices)[dst_index] = max_index;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return dst;
+}
+
+// Generic (non-quantized) entry point: forwards to the internal reference; output_qinfo is not used.
+template <typename T>
+SimpleTensor<T> pooling_3d_layer(const SimpleTensor<T> &src, const Pooling3dLayerInfo &pool3d_info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices)
+{
+    ARM_COMPUTE_UNUSED(output_qinfo);
+    return pooling_3d_layer_internal<T>(src, pool3d_info, indices);
+}
+template SimpleTensor<float> pooling_3d_layer(const SimpleTensor<float> &src, const Pooling3dLayerInfo &pool3d_info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices);
+template SimpleTensor<half> pooling_3d_layer(const SimpleTensor<half> &src, const Pooling3dLayerInfo &pool3d_info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices);
+
+template <>
+SimpleTensor<int8_t> pooling_3d_layer<int8_t>(const SimpleTensor<int8_t> &src, const Pooling3dLayerInfo &pool3d_info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices)
+{
+    // Quantized int8 path: pool in float on the dequantized input, then requantize with output_qinfo.
+    SimpleTensor<float> pooled = pooling_3d_layer_internal<float>(convert_from_asymmetric(src), pool3d_info, indices);
+    return convert_to_asymmetric<int8_t>(pooled, output_qinfo);
+}
+
+template <>
+SimpleTensor<uint8_t> pooling_3d_layer<uint8_t>(const SimpleTensor<uint8_t> &src, const Pooling3dLayerInfo &pool3d_info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices)
+{
+    // Quantized uint8 path: pool in float on the dequantized input, then requantize with output_qinfo.
+    SimpleTensor<float> pooled = pooling_3d_layer_internal<float>(convert_from_asymmetric(src), pool3d_info, indices);
+    return convert_to_asymmetric<uint8_t>(pooled, output_qinfo);
+}
+
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/Pooling3dLayer.h b/tests/validation/reference/Pooling3dLayer.h
new file mode 100644
index 0000000000..481a0d3024
--- /dev/null
+++ b/tests/validation/reference/Pooling3dLayer.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_POOL3D_LAYER_H
+#define ARM_COMPUTE_TEST_POOL3D_LAYER_H
+
+#include "Utils.h"
+#include "arm_compute/core/Types.h"
+#include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> pooling_3d_layer_internal(const SimpleTensor<T> &src, const Pooling3dLayerInfo &pool3d_info, SimpleTensor<uint32_t> *indices = nullptr); // Core reference 3D pooling; fills *indices with the window-maximum index when indices is non-null.
+
+template <typename T>
+SimpleTensor<T> pooling_3d_layer(const SimpleTensor<T> &src, const Pooling3dLayerInfo &pool3d_info, const QuantizationInfo &output_qinfo = QuantizationInfo(),
+                                 SimpleTensor<uint32_t> *indices = nullptr); // Entry point: the generic version forwards to pooling_3d_layer_internal and ignores output_qinfo; the int8_t/uint8_t specializations requantize with it.
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_POOL3D_LAYER_H */
diff --git a/tests/validation/reference/PoolingLayer.cpp b/tests/validation/reference/PoolingLayer.cpp
index 5f4edfe49c..bf7bd0c1df 100644
--- a/tests/validation/reference/PoolingLayer.cpp
+++ b/tests/validation/reference/PoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,7 +40,6 @@ using namespace arm_compute::misc::shape_calculator;
 template <typename T, typename ACC_T, typename std::enable_if<is_floating_point<T>::value, int>::type>
 SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const PoolingLayerInfo &info, SimpleTensor<uint32_t> *indices, DataLayout data_layout)
 {
-    ARM_COMPUTE_ERROR_ON(info.is_global_pooling && (src.shape().x() != src.shape().y()));
     // Create reference
     SimpleTensor<T> dst{ compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type()), info), src.data_type(), 1 };
     auto            pooled_shape = compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type()), info);
@@ -84,20 +83,28 @@ SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const Pooling
                     {
                         int wstart   = w * pool_stride_x - pad_left;
                         int hstart   = h * pool_stride_y - pad_top;
+
+                        // Used to calculate kernel indices
+                        int  kh_start = std::max(0, -hstart);
+                        int  kw_start = std::max(0, -wstart);
+                        int  max_ker_index{ 0 };
+
                         int wend     = std::min(wstart + pool_size_x, w_src);
                         int hend     = std::min(hstart + pool_size_y, h_src);
                         wstart       = std::max(wstart, 0);
                         hstart       = std::max(hstart, 0);
-                        auto max_val = std::numeric_limits<ACC_T>::lowest();
+                        auto max_val = info.use_inf_as_limit ? -std::numeric_limits<ACC_T>::infinity() : std::numeric_limits<ACC_T>::lowest();
                         int  max_index{ 0 };
-                        for(int y = hstart; y < hend; ++y)
+
+                        for(int y = hstart, kh = kh_start; y < hend; ++y, ++kh)
                         {
-                            for(int x = wstart; x < wend; ++x)
+                            for(int x = wstart, kw = kw_start; x < wend; ++x, ++kw)
                             {
                                 const auto val = static_cast<ACC_T>(src[b * z_src * h_src * w_src + r * h_src * w_src + y * w_src + x]);
                                 if(val > max_val)
                                 {
-                                    max_val = val;
+                                    max_val   = val;
+                                    max_ker_index = pool_size_x * (kh) + (kw);
                                     if(data_layout == DataLayout::NCHW)
                                     {
                                         max_index = coord2index(src.shape(), Coordinates(x, y, r, 0));
@@ -113,7 +120,7 @@ SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const Pooling
                         dst[b * z_dst * h_dst * w_dst + r * h_dst * w_dst + h * w_dst + w] = static_cast<T>(max_val);
                         if(indices)
                         {
-                            (*indices)[b * z_dst * h_dst * w_dst + r * h_dst * w_dst + h * w_dst + w] = max_index;
+                            (*indices)[b * z_dst * h_dst * w_dst + r * h_dst * w_dst + h * w_dst + w] = (info.use_kernel_indices) ? max_ker_index : max_index;
                         }
                     }
                 }
diff --git a/tests/validation/reference/QuantizationLayer.cpp b/tests/validation/reference/QuantizationLayer.cpp
index 27665375c3..ad7ba7ac43 100644
--- a/tests/validation/reference/QuantizationLayer.cpp
+++ b/tests/validation/reference/QuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp
index ffb79f86c5..c189bc2d47 100644
--- a/tests/validation/reference/ReductionOperation.cpp
+++ b/tests/validation/reference/ReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #include "ReductionOperation.h"
-
 #include "tests/validation/Helpers.h"
 
 #include <algorithm>
@@ -39,7 +38,7 @@ namespace reference
 namespace
 {
 template <typename T, typename OT>
-OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, int stride)
+OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, int stride, RoundingPolicy policy)
 {
     using type = typename std::remove_cv<OT>::type;
     T res;
@@ -99,7 +98,14 @@ OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, in
         }
         if(op == ReductionOperation::MEAN_SUM && reduce_elements > 0)
         {
-            int_res /= reduce_elements;
+            // Only use rounding in aarch64 to be consistent with kernel
+#ifdef __aarch64__
+            // Divide in float, round according to the given policy, then implicitly convert back to int
+            int_res = round(static_cast<float>(int_res) / static_cast<float>(reduce_elements), policy);
+#else  // defined(__aarch64__)
+            ARM_COMPUTE_UNUSED(policy);
+            int_res /= reduce_elements; // Legacy compatibility
+#endif // __aarch64__
         }
         res = static_cast<type>(int_res);
     }
@@ -175,12 +181,12 @@ OT reduce_operation_arg_min_max(const T *ptr, int reduce_elements, ReductionOper
 } // namespace
 
 template <typename T, typename OT>
-SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op)
+SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
+                                             DataType output_type, RoundingPolicy policy)
 {
     // Create reference
-    const bool         is_arg_min_max   = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
-    DataType           output_data_type = is_arg_min_max ? DataType::S32 : src.data_type();
-    SimpleTensor<OT>   dst{ dst_shape, output_data_type, 1, src.quantization_info() };
+    const bool         is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
+    SimpleTensor<OT>   dst{ dst_shape, output_type, 1, src.quantization_info() };
     const unsigned int src_width    = src.shape().x();
     const unsigned int src_height   = src.shape().y();
     const unsigned int src_depth    = src.shape().z();
@@ -197,7 +203,7 @@ SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const T
                 const T *src_row_ptr = src.data() + du * reduce_elems;
                 dst[du]              = is_arg_min_max ?
                                        reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, 1) :
-                                       reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, 1);
+                                       reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, 1, policy);
             }
         }
         break;
@@ -213,7 +219,7 @@ SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const T
                     const T *src_row_ptr = src.data() + in_offset;
                     dst[out_offset]       = is_arg_min_max ?
                                             reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, src_width) :
-                                            reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width);
+                                            reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width, policy);
                 }
             }
         }
@@ -232,7 +238,7 @@ SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const T
                         const T *src_row_ptr = src.data() + in_offset;
                         dst[out_offset]       = is_arg_min_max ?
                                                 reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height) :
-                                                reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height);
+                                                reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height, policy);
                     }
                 }
             }
@@ -254,7 +260,7 @@ SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const T
                             const T *src_row_ptr = src.data() + in_offset;
                             dst[out_offset]       = is_arg_min_max ?
                                                     reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth) :
-                                                    reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth);
+                                                    reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth, policy);
                         }
                     }
                 }
@@ -269,74 +275,89 @@ SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const T
 }
 
 template <typename T, typename OT>
-SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output)
+SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
+                                     DataType output_type, QuantizationInfo quantization_info_output, RoundingPolicy policy)
 {
     ARM_COMPUTE_UNUSED(quantization_info_output);
-    return compute_reduction_operation<T, OT>(src, dst_shape, axis, op);
+    return compute_reduction_operation<T, OT>(src, dst_shape, axis, op, output_type, policy);
 }
 
 template <>
-SimpleTensor<uint8_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output)
+SimpleTensor<uint8_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
+                                          DataType output_type, QuantizationInfo quantization_info_output, RoundingPolicy policy) // output_type and policy are forwarded on the integer paths below
 {
     if(src.data_type() == DataType::QASYMM8)
     {
         // If the operation is MEAN_SUM, we can directly use the uint8 implementation without taking into account scale and offset
         if(op == ReductionOperation::MEAN_SUM && src.quantization_info() == quantization_info_output)
         {
-            return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op);
+            return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op, output_type, policy);
         }
         else
         {
             SimpleTensor<float> src_f = convert_from_asymmetric(src);
-            SimpleTensor<float> dst_f = reference::reduction_operation<float, float>(src_f, dst_shape, axis, op);
+            SimpleTensor<float> dst_f = reference::reduction_operation<float, float>(src_f, dst_shape, axis, op, output_type); // NOTE(review): policy is not forwarded, so the declared default (TO_ZERO) applies, and output_type is passed on for the float intermediate - confirm both are intended
             return convert_to_asymmetric<uint8_t>(dst_f, quantization_info_output);
         }
     }
     else
     {
-        return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op);
+        return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op, output_type, policy); // src is not QASYMM8: reduce the raw uint8 values
     }
 }
 
 template <>
-SimpleTensor<int8_t> reduction_operation(const SimpleTensor<int8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output)
+SimpleTensor<int8_t> reduction_operation(const SimpleTensor<int8_t> &src, const TensorShape &dst_shape, unsigned int axis,
+                                         ReductionOperation op, DataType output_type, QuantizationInfo quantization_info_output, RoundingPolicy policy) // output_type and policy are forwarded on the integer paths below
 {
     if(src.data_type() == DataType::QASYMM8_SIGNED)
     {
         // If the operation is MEAN_SUM, we can directly use the int8 implementation without taking into account scale and offset
         if(op == ReductionOperation::MEAN_SUM && src.quantization_info() == quantization_info_output)
         {
-            return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op);
+            return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op, output_type, policy);
         }
         else
         {
             SimpleTensor<float> src_f = convert_from_asymmetric(src);
-            SimpleTensor<float> dst_f = reference::reduction_operation<float, float>(src_f, dst_shape, axis, op);
+            SimpleTensor<float> dst_f = reference::reduction_operation<float, float>(src_f, dst_shape, axis, op, output_type); // NOTE(review): policy is not forwarded, so the declared default (TO_ZERO) applies, and output_type is passed on for the float intermediate - confirm both are intended
             return convert_to_asymmetric<int8_t>(dst_f, quantization_info_output);
         }
     }
     else
     {
-        return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op);
+        return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op, output_type, policy); // src is not QASYMM8_SIGNED: reduce the raw int8 values
     }
 }
 
 template SimpleTensor<float> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                 QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                 DataType output_type = DataType::S32, QuantizationInfo quantization_info_output = QuantizationInfo(),
+                                                 RoundingPolicy policy = RoundingPolicy::TO_ZERO);
+
 template SimpleTensor<half> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                DataType         output_type              = DataType::S32,
+                                                QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 
 template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                   QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                   DataType         output_type              = DataType::S32,
+                                                   QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
+
 template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<int32_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                   QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                   DataType         output_type              = DataType::S32,
+                                                   QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                   QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                   DataType         output_type              = DataType::S32,
+                                                   QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                   QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                   DataType         output_type              = DataType::S32,
+                                                   QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<int8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                   QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                   DataType         output_type              = DataType::S32,
+                                                   QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 
+template SimpleTensor<int64_t> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
+                                                   DataType output_type = DataType::S32, QuantizationInfo quantization_info_output = QuantizationInfo(),
+                                                   RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ReductionOperation.h b/tests/validation/reference/ReductionOperation.h
index 9c9e721b29..fb2e7a7093 100644
--- a/tests/validation/reference/ReductionOperation.h
+++ b/tests/validation/reference/ReductionOperation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_TEST_REDUCTION_OPERATION_H
 #define ARM_COMPUTE_TEST_REDUCTION_OPERATION_H
 
+#include "arm_compute/core/Rounding.h"
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
 
@@ -36,8 +37,8 @@ namespace validation
 namespace reference
 {
 template <typename T, typename OT>
-SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                     QuantizationInfo quantization_info_output = QuantizationInfo());
+SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, DataType output_type = DataType::S32,
+                                     QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Remap.cpp b/tests/validation/reference/Remap.cpp
deleted file mode 100644
index 33c5a7de68..0000000000
--- a/tests/validation/reference/Remap.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "Remap.h"
-
-#include "Utils.h"
-#include "tests/validation/Helpers.h"
-
-#include <algorithm>
-#include <array>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> remap(const SimpleTensor<T> &in, SimpleTensor<float> &map_x, SimpleTensor<float> &map_y, SimpleTensor<T> &valid_mask, InterpolationPolicy policy, BorderMode border_mode,
-                      T constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(border_mode == BorderMode::REPLICATE, "BorderMode not supported");
-    SimpleTensor<T> out(in.shape(), in.data_type());
-    ARM_COMPUTE_ERROR_ON(out.num_elements() != map_x.num_elements());
-    const int      width        = in.shape().x();
-    const int      height       = in.shape().y();
-    const uint32_t num_elements = out.num_elements();
-    for(uint32_t idx = 0; idx < num_elements; idx++)
-    {
-        const Coordinates id_out = index2coord(out.shape(), idx);
-        valid_mask[idx]          = 1;
-        Coordinates src_idx      = id_out; // need to setup all coordinates and not just xy
-        if((0 <= map_y[idx]) && (map_y[idx] < height) && (0 <= map_x[idx]) && (map_x[idx] < width))
-        {
-            switch(policy)
-            {
-                case InterpolationPolicy::NEAREST_NEIGHBOR:
-                {
-                    src_idx.set(0, static_cast<int>(std::floor(map_x[idx])));
-                    src_idx.set(1, static_cast<int>(std::floor(map_y[idx])));
-                    out[idx] = in[coord2index(in.shape(), src_idx)];
-                    break;
-                }
-                case InterpolationPolicy::BILINEAR:
-                {
-                    (valid_bilinear_policy(map_x[idx], map_y[idx], width, height, border_mode)) ?
-                    out[idx]        = bilinear_policy(in, src_idx, map_x[idx], map_y[idx], border_mode, constant_border_value) :
-                                      valid_mask[idx] = 0;
-                    break;
-                }
-                case InterpolationPolicy::AREA:
-                default:
-                    ARM_COMPUTE_ERROR("Interpolation not supported");
-                    break;
-            }
-        }
-        else
-        {
-            if(border_mode == BorderMode::UNDEFINED)
-            {
-                valid_mask[idx] = 0;
-            }
-            else
-            {
-                switch(policy)
-                {
-                    case InterpolationPolicy::NEAREST_NEIGHBOR:
-                        out[idx] = constant_border_value;
-                        break;
-                    case InterpolationPolicy::BILINEAR:
-                        out[idx] = bilinear_policy(in, src_idx, map_x[idx], map_y[idx], border_mode, constant_border_value);
-                        break;
-                    case InterpolationPolicy::AREA:
-                    default:
-                        break;
-                }
-            }
-        }
-    }
-
-    return out;
-}
-
-template SimpleTensor<uint8_t> remap(const SimpleTensor<uint8_t> &src, SimpleTensor<float> &map_x, SimpleTensor<float> &map_y, SimpleTensor<uint8_t> &valid_mask, InterpolationPolicy policy,
-                                     BorderMode border_mode,
-                                     uint8_t    constant_border_value);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/Reorder.cpp b/tests/validation/reference/Reorder.cpp
new file mode 100644
index 0000000000..8abb372596
--- /dev/null
+++ b/tests/validation/reference/Reorder.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Reorder.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+
+/*
+ * Generic transform (reference implementation).
+ *
+ * Assuming the untransposed case, this works by first reading <BlockBy>
+ * consecutive values from the first input row.  The same number of values
+ * is then read from each of the next <IntBy-1> rows, then we return to the
+ * first input row and repeat.  Output elements are written sequentially.
+ *
+ * Work requested in either dimension need not be a multiple of the block
+ * sizes: the missing rows/columns of a partial block are emitted as zeros.
+ */
+template <unsigned int tIntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize, typename d_type, arm_gemm::VLType vlt>
+struct Transform_ref
+{
+    template <typename TOut, typename TIn>
+    static void Transform(TOut &out, const TIn &in, const int stride, // in: taken by const-ref so the source is not copied per call
+                          const int y0, const int ymax, const int x0, const int xmax) // half-open ranges [y0, ymax) and [x0, xmax)
+    {
+        // NOTE: This code is disabled to avoid the call to get_vector_length(), so templated transforms will not be
+        // correct for SVE.  This is not an issue as we have specializations for all SVE cases.
+        // For SVE cases we multiply the interleave factor by the vector length.
+        // const unsigned int IntBy = tIntBy * (vlt == VLType::SVE ? get_vector_length<TOut>() / BlockBy : 1);
+        const unsigned int IntBy     = tIntBy;
+        int                out_index = 0; // next slot of `out` to write; only ever advances
+
+        const int n_whole_y_blocks = (ymax - y0) / IntBy;
+        const int y_remainders     = (ymax - y0) % IntBy;
+        const int n_y_blocks       = n_whole_y_blocks + (y_remainders ? 1 : 0);
+
+        const int n_whole_x_blocks = (xmax - x0) / BlockBy;
+        const int x_remainders     = (xmax - x0) % BlockBy;
+        const int n_x_blocks       = n_whole_x_blocks + (x_remainders ? 1 : 0);
+
+        // "Y" loop: advance down the rows of the source IntBy rows at a time.
+        // Set up fill_rows to show the number of rows to copy from, and blank_rows
+        // for the number of blank rows to add.
+        for(int y_block = 0; y_block < n_y_blocks; y_block++)
+        {
+            const int fill_rows  = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
+            const int blank_rows = IntBy - fill_rows;
+
+            const int y_base = y0 + (y_block * IntBy);
+
+            // So now advance along this block of rows, BlockBy columns at a time.
+            for(int x_block = 0; x_block < n_x_blocks; x_block++)
+            {
+                const int fill_cols  = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
+                const int blank_cols = BlockBy - fill_cols;
+
+                const int x_base = x0 + (x_block * BlockBy);
+
+                for(int row = 0; row < fill_rows; row++)
+                {
+                    for(int col = 0; col < fill_cols; col++)
+                    {
+                        // In-range copy.  If it's transposed, we reverse the sense of rows and columns here.
+                        if(Transposed)
+                        {
+                            out[out_index] = in[(x_base + col) * stride + y_base + row];
+                            out_index++;
+                        }
+                        else
+                        {
+                            out[out_index] = in[(y_base + row) * stride + x_base + col];
+                            out_index++;
+                        }
+                    }
+                    // "col" tail - row is in range but column is out of range.
+                    for(int col = 0; col < blank_cols; col++)
+                    {
+                        out[out_index] = 0;
+                        out_index++;
+                    }
+                }
+                // "row" tail - row is out of range so fill with zeros always.
+                const d_type zeroval = 0;
+                const int    pads    = blank_rows * (fill_cols + blank_cols);
+
+                for(int i = 0; i < pads; i++)
+                {
+                    out[out_index + i] = zeroval; // each pad element gets its own slot (was: the same slot rewritten `pads` times)
+                }
+
+                out_index += pads;
+            }
+        }
+    }
+};
+
+template <typename T> // Reference reorder: builds a tensor of `output_shape` from `src`, laid out according to `output_wf`
+SimpleTensor<T> reorder_layer(const SimpleTensor<T> &src, const TensorShape &output_shape, WeightFormat output_wf)
+{
+    SimpleTensor<T> dst{ output_shape, src.data_type() }; // result keeps the data type of src
+    const int       cols = src.shape()[0]; // extent of dimension 0 of src
+    const int       rows = src.shape()[1]; // extent of dimension 1 of src
+
+    switch(output_wf)
+    {
+        case WeightFormat::OHWIo4:
+        {
+            Transform_ref<4, 1, true, sizeof(float), sizeof(float), float, arm_gemm::VLType::None>::Transform<SimpleTensor<T> &, SimpleTensor<T>>(dst, src, rows, 0, rows, 0, cols); // interleave 4, block 1, transposed; stride = rows, y in [0, rows), x in [0, cols). NOTE(review): element type is hard-coded to float although T is a template parameter - confirm if more instantiations are added
+            break;
+        }
+        case WeightFormat::OHWIo8:
+        {
+            Transform_ref<8, 1, true, sizeof(float), sizeof(float), float, arm_gemm::VLType::None>::Transform<SimpleTensor<T> &, SimpleTensor<T>>(dst, src, rows, 0, rows, 0, cols); // same as above with interleave 8
+            break;
+        }
+        default: // any other format: nothing is written here and dst is returned as constructed
+            break;
+    }
+
+    return dst;
+}
+
+template SimpleTensor<float> reorder_layer(const SimpleTensor<float> &src, const TensorShape &output_shape, WeightFormat output_wf);
+
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/Remap.h b/tests/validation/reference/Reorder.h
index 0726f75965..94ee5078f8 100644
--- a/tests/validation/reference/Remap.h
+++ b/tests/validation/reference/Reorder.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,10 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_REMAP_H
-#define ARM_COMPUTE_TEST_REMAP_H
+#ifndef ACL_TESTS_VALIDATION_REFERENCE_REORDER
+#define ACL_TESTS_VALIDATION_REFERENCE_REORDER
 
 #include "tests/SimpleTensor.h"
+#include "tests/Types.h"
 
 namespace arm_compute
 {
@@ -35,10 +36,9 @@ namespace validation
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> remap(const SimpleTensor<T> &in, SimpleTensor<float> &map_x, SimpleTensor<float> &map_y, SimpleTensor<T> &valid_mask, InterpolationPolicy policy, BorderMode border_mode,
-                      T constant_border_value = 0);
+SimpleTensor<T> reorder_layer(const SimpleTensor<T> &src, const TensorShape &output_shape, WeightFormat output_wf);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_REMAP_H */
+#endif /* ACL_TESTS_VALIDATION_REFERENCE_REORDER */
diff --git a/tests/validation/reference/ReshapeLayer.cpp b/tests/validation/reference/ReshapeLayer.cpp
index daea001be6..30a58dd65b 100644
--- a/tests/validation/reference/ReshapeLayer.cpp
+++ b/tests/validation/reference/ReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017,2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,14 +44,15 @@ SimpleTensor<T> reshape_layer(const SimpleTensor<T> &src, const TensorShape &out
     return dst;
 }
 
-template SimpleTensor<uint8_t> reshape_layer(const SimpleTensor<uint8_t> &src, const TensorShape &output_shape);
-template SimpleTensor<int8_t> reshape_layer(const SimpleTensor<int8_t> &src, const TensorShape &output_shape);
+template SimpleTensor<uint8_t>  reshape_layer(const SimpleTensor<uint8_t> &src, const TensorShape &output_shape);
+template SimpleTensor<int8_t>   reshape_layer(const SimpleTensor<int8_t> &src, const TensorShape &output_shape);
 template SimpleTensor<uint16_t> reshape_layer(const SimpleTensor<uint16_t> &src, const TensorShape &output_shape);
-template SimpleTensor<int16_t> reshape_layer(const SimpleTensor<int16_t> &src, const TensorShape &output_shape);
+template SimpleTensor<int16_t>  reshape_layer(const SimpleTensor<int16_t> &src, const TensorShape &output_shape);
 template SimpleTensor<uint32_t> reshape_layer(const SimpleTensor<uint32_t> &src, const TensorShape &output_shape);
-template SimpleTensor<int32_t> reshape_layer(const SimpleTensor<int32_t> &src, const TensorShape &output_shape);
-template SimpleTensor<half> reshape_layer(const SimpleTensor<half> &src, const TensorShape &output_shape);
-template SimpleTensor<float> reshape_layer(const SimpleTensor<float> &src, const TensorShape &output_shape);
+template SimpleTensor<int32_t>  reshape_layer(const SimpleTensor<int32_t> &src, const TensorShape &output_shape);
+template SimpleTensor<half>     reshape_layer(const SimpleTensor<half> &src, const TensorShape &output_shape);
+template SimpleTensor<float>    reshape_layer(const SimpleTensor<float> &src, const TensorShape &output_shape);
+template SimpleTensor<bfloat16> reshape_layer(const SimpleTensor<bfloat16> &src, const TensorShape &output_shape);
 /** [ReshapeLayer] **/
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/Reverse.cpp b/tests/validation/reference/Reverse.cpp
index c6c4614278..7924f900d1 100644
--- a/tests/validation/reference/Reverse.cpp
+++ b/tests/validation/reference/Reverse.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,8 +35,9 @@ namespace validation
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> reverse(const SimpleTensor<T> &src, const SimpleTensor<uint32_t> &axis)
+SimpleTensor<T> reverse(const SimpleTensor<T> &src, const SimpleTensor<int32_t> &axis, bool use_inverted_axis)
 {
+    ARM_COMPUTE_ERROR_ON(src.shape().num_dimensions() > 4);
     ARM_COMPUTE_ERROR_ON(axis.shape().num_dimensions() > 1);
     ARM_COMPUTE_ERROR_ON(axis.shape().x() > 4);
 
@@ -48,10 +49,32 @@ SimpleTensor<T> reverse(const SimpleTensor<T> &src, const SimpleTensor<uint32_t>
     const unsigned int depth   = src.shape()[2];
     const unsigned int batches = src.shape()[3];
 
+    const int rank = src.shape().num_dimensions();
+
     std::array<bool, 4> to_reverse = { { false, false, false, false } };
     for(int i = 0; i < axis.num_elements(); ++i)
     {
-        to_reverse[axis[i]] = true;
+        int axis_i = axis[i];
+
+        // The values of axis tensor must be between [-rank, rank-1].
+        if((axis_i < -rank) || (axis_i >= rank))
+        {
+            ARM_COMPUTE_ERROR("the values of the axis tensor must be within [-rank, rank-1].");
+        }
+
+        // A negative axis value counts back from the end, i.e. targeted axis(i) = rank + axis(i)
+        if(axis_i < 0)
+        {
+            axis_i = rank + axis_i;
+        }
+
+        // Reverse ACL axis indices convention i.e. (inverted)axis = (tensor_rank - 1) - axis
+        if(use_inverted_axis)
+        {
+            axis_i = (rank - 1) - axis_i;
+        }
+
+        to_reverse[axis_i] = true;
     }
 
     const uint32_t num_elements = src.num_elements();
@@ -73,9 +96,9 @@ SimpleTensor<T> reverse(const SimpleTensor<T> &src, const SimpleTensor<uint32_t>
     return dst;
 }
 
-template SimpleTensor<uint8_t> reverse(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint32_t> &axis);
-template SimpleTensor<half> reverse(const SimpleTensor<half> &src, const SimpleTensor<uint32_t> &axis);
-template SimpleTensor<float> reverse(const SimpleTensor<float> &src, const SimpleTensor<uint32_t> &axis);
+template SimpleTensor<uint8_t> reverse(const SimpleTensor<uint8_t> &src, const SimpleTensor<int32_t> &axis, bool use_inverted_axis);
+template SimpleTensor<half> reverse(const SimpleTensor<half> &src, const SimpleTensor<int32_t> &axis, bool use_inverted_axis);
+template SimpleTensor<float> reverse(const SimpleTensor<float> &src, const SimpleTensor<int32_t> &axis, bool use_inverted_axis);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Reverse.h b/tests/validation/reference/Reverse.h
index 4a28da7270..30926b05a5 100644
--- a/tests/validation/reference/Reverse.h
+++ b/tests/validation/reference/Reverse.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2019, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_REVERSE_H
-#define ARM_COMPUTE_TEST_REVERSE_H
+#ifndef ACL_TESTS_VALIDATION_REFERENCE_REVERSE_H
+#define ACL_TESTS_VALIDATION_REFERENCE_REVERSE_H
 
 #include "tests/SimpleTensor.h"
 
@@ -35,9 +35,9 @@ namespace validation
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> reverse(const SimpleTensor<T> &src, const SimpleTensor<uint32_t> &axis);
+SimpleTensor<T> reverse(const SimpleTensor<T> &src, const SimpleTensor<int32_t> &axis, bool use_inverted_axis = false);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_REVERSE_H */
+#endif // ACL_TESTS_VALIDATION_REFERENCE_REVERSE_H
diff --git a/tests/validation/reference/Scale.cpp b/tests/validation/reference/Scale.cpp
index 71e98fd776..2f429cb29b 100644
--- a/tests/validation/reference/Scale.cpp
+++ b/tests/validation/reference/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,6 @@
 #include "Scale.h"
 
 #include "Utils.h"
-#include "arm_compute/core/utils/misc/Utility.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
 
@@ -183,14 +182,15 @@ SimpleTensor<T> scale_core(const SimpleTensor<T> &in, float scale_x, float scale
 
 template <typename T>
 SimpleTensor<T> scale(const SimpleTensor<T> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, T constant_border_value,
-                      SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners)
+                      SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners, QuantizationInfo output_quantization_info)
 {
+    ARM_COMPUTE_UNUSED(output_quantization_info);
     return scale_core<T>(src, scale_x, scale_y, policy, border_mode, constant_border_value, sampling_policy, ceil_policy_scale, align_corners);
 }
 
 template <>
 SimpleTensor<uint8_t> scale(const SimpleTensor<uint8_t> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value,
-                            SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners)
+                            SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners, QuantizationInfo output_quantization_info)
 {
     SimpleTensor<uint8_t> dst;
     if(src.quantization_info().uniform().scale != 0.f)
@@ -198,7 +198,7 @@ SimpleTensor<uint8_t> scale(const SimpleTensor<uint8_t> &src, float scale_x, flo
         SimpleTensor<float> src_tmp                 = convert_from_asymmetric(src);
         float               constant_border_value_f = dequantize_qasymm8(constant_border_value, src.quantization_info());
         SimpleTensor<float> dst_tmp                 = scale_core<float>(src_tmp, scale_x, scale_y, policy, border_mode, constant_border_value_f, sampling_policy, ceil_policy_scale, align_corners);
-        dst                                         = convert_to_asymmetric<uint8_t>(dst_tmp, src.quantization_info());
+        dst                                         = convert_to_asymmetric<uint8_t>(dst_tmp, output_quantization_info);
     }
     else
     {
@@ -209,7 +209,7 @@ SimpleTensor<uint8_t> scale(const SimpleTensor<uint8_t> &src, float scale_x, flo
 
 template <>
 SimpleTensor<int8_t> scale(const SimpleTensor<int8_t> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, int8_t constant_border_value,
-                           SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners)
+                           SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners, QuantizationInfo output_quantization_info)
 {
     SimpleTensor<int8_t> dst;
     if(src.quantization_info().uniform().scale != 0.f)
@@ -217,7 +217,7 @@ SimpleTensor<int8_t> scale(const SimpleTensor<int8_t> &src, float scale_x, float
         SimpleTensor<float> src_tmp                 = convert_from_asymmetric(src);
         float               constant_border_value_f = dequantize_qasymm8_signed(constant_border_value, src.quantization_info());
         SimpleTensor<float> dst_tmp                 = scale_core<float>(src_tmp, scale_x, scale_y, policy, border_mode, constant_border_value_f, sampling_policy, ceil_policy_scale, align_corners);
-        dst                                         = convert_to_asymmetric<int8_t>(dst_tmp, src.quantization_info());
+        dst                                         = convert_to_asymmetric<int8_t>(dst_tmp, output_quantization_info);
     }
     else
     {
@@ -227,11 +227,11 @@ SimpleTensor<int8_t> scale(const SimpleTensor<int8_t> &src, float scale_x, float
 }
 
 template SimpleTensor<int16_t> scale(const SimpleTensor<int16_t> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, int16_t constant_border_value,
-                                     SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners);
+                                     SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners, QuantizationInfo output_quantization_info);
 template SimpleTensor<half> scale(const SimpleTensor<half> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, half constant_border_value,
-                                  SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners);
+                                  SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners, QuantizationInfo output_quantization_info);
 template SimpleTensor<float> scale(const SimpleTensor<float> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, float constant_border_value,
-                                   SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners);
+                                   SamplingPolicy sampling_policy, bool ceil_policy_scale, bool align_corners, QuantizationInfo output_quantization_info);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Scale.h b/tests/validation/reference/Scale.h
index c66af8d94e..c32c07d1c0 100644
--- a/tests/validation/reference/Scale.h
+++ b/tests/validation/reference/Scale.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,7 +37,7 @@ namespace reference
 {
 template <typename T>
 SimpleTensor<T> scale(const SimpleTensor<T> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, T constant_border_value = 0,
-                      SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool ceil_policy_scale = false, bool align_corners = false);
+                      SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool ceil_policy_scale = false, bool align_corners = false, QuantizationInfo output_quantization_info = QuantizationInfo());
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ScatterLayer.cpp b/tests/validation/reference/ScatterLayer.cpp
new file mode 100644
index 0000000000..55c48a9002
--- /dev/null
+++ b/tests/validation/reference/ScatterLayer.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ScatterLayer.h"
+#include "tests/validation/Helpers.h"
+#include "arm_compute/core/TensorShape.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+
+template <typename T>
+T reduce_op(const T &current,const T &update,const ScatterFunction func)
+{
+    switch(func)
+    {
+        case ScatterFunction::Update:
+            return update;
+            break;
+        case ScatterFunction::Add:
+            return current + update;
+            break;
+        case ScatterFunction::Sub:
+            return current - update;
+            break;
+        case ScatterFunction::Max:
+            return std::max(current, update);
+            break;
+        case ScatterFunction::Min:
+            return std::min(current, update);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported Scatter function");
+            break;
+    }
+}
+
+template float reduce_op(const float &current,const float &update,const ScatterFunction func);
+template half reduce_op(const half &current,const half &update,const ScatterFunction func);
+}
+
+// NOTE: This function expects collapsed tensors as input.
+// Batch dims for update/indices tensors should be collapsed into a single dim.
+// Data dims should be collapsed into a single dim for both update and src tensors prior to calling this function.
+template <typename T>
+SimpleTensor<T> scatter_layer_internal(const SimpleTensor<T> &src, const SimpleTensor<T> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info)
+{
+    // 1. If zero initialization variable is false, copy src data to dst.
+    SimpleTensor<T> dst{ out_shape, src.data_type(), 1 };
+    if(!info.zero_initialization)
+    {
+        std::copy_n(src.data(), src.num_elements(), dst.data());
+    }
+
+    // Number of elements between each value of the dim being iterated through
+    const unsigned int data_stride = updates.shape().total_size_lower(updates.shape().num_dimensions() - 1);
+    const unsigned int no_output_dims = out_shape.num_dimensions();
+
+    // Calculate output stride at given index for all output dims.
+    std::vector<unsigned int> out_stride_at_idx(no_output_dims);
+    for (unsigned int i = 0 ; i < no_output_dims; i++)
+    {
+        out_stride_at_idx[i] = out_shape.total_size_lower(i);
+    }
+
+    const unsigned int indices_x_dim = static_cast<unsigned int>(indices.shape()[0]);
+    const unsigned int indices_y_dim = static_cast<unsigned int>(indices.shape()[1]);
+
+    // 2. Iterate over indices tensor y-dim and apply the scatter function (info.func) to sections of dst tensor using relevant areas of update tensor.
+    for(unsigned int i = 0; i < indices_y_dim; i++)
+    {
+        // NOTE: Currently, indices.shape() == [X, Y, 1, 1], where X is the indices dim and Y is the batch dim.
+        // Starting index for both the update and indices tensors.
+        const unsigned int update_dim_start = i * data_stride;
+        const unsigned int indices_dim_start = i * indices_x_dim;
+        bool out_of_bounds = false;
+        unsigned int out_offset_acc = 0;
+
+        // Iterate over each indices value for the relevant batch and accumulate the offset.
+        for(unsigned int j = 0; j < indices_x_dim; j++)
+        {
+            // Get first index value with i * indices_x_dim (iterating through y-dim/batch idx), then iterate through x dim by adding j
+            const int index_value = indices[indices_dim_start + j];
+            const unsigned int out_dim = no_output_dims - (j+1);   // Calculate corresponding output dim to current index value.
+            if(index_value < static_cast<int>(out_shape[out_dim]) && index_value >= 0)
+            {
+                out_offset_acc += (index_value * out_stride_at_idx[out_dim]); // offset accumulation
+            }
+            else
+            {
+                out_of_bounds = true;
+                break;
+            }
+        }
+
+        // If not out of bounds, combine update tensor elements into the output using the selected scatter function
+        if(!out_of_bounds)
+        {
+            for (unsigned int j = 0 ; j < data_stride; j++)
+            {
+                dst[out_offset_acc + j] = reduce_op(dst[out_offset_acc + j], updates[update_dim_start + j], info.func);
+            }
+        }
+    }
+    return dst;
+}
+
+template <typename T>
+SimpleTensor<T> scatter_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info)
+{
+    return scatter_layer_internal<T>(src, updates, indices, out_shape, info);
+}
+
+template SimpleTensor<float> scatter_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<half> scatter_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<int32_t> scatter_layer(const SimpleTensor<int32_t> &src, const SimpleTensor<int32_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<uint32_t> scatter_layer(const SimpleTensor<uint32_t> &src, const SimpleTensor<uint32_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<int16_t> scatter_layer(const SimpleTensor<int16_t> &src, const SimpleTensor<int16_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<uint16_t> scatter_layer(const SimpleTensor<uint16_t> &src, const SimpleTensor<uint16_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<int8_t> scatter_layer(const SimpleTensor<int8_t> &src, const SimpleTensor<int8_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+template SimpleTensor<uint8_t> scatter_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/ScatterLayer.h b/tests/validation/reference/ScatterLayer.h
new file mode 100644
index 0000000000..97d5e70b0d
--- /dev/null
+++ b/tests/validation/reference/ScatterLayer.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_REFERENCE_SCATTERLAYER_H
+#define ACL_TESTS_VALIDATION_REFERENCE_SCATTERLAYER_H
+
+#include "Utils.h"
+#include "arm_compute/function_info/ScatterInfo.h"
+#include "tests/SimpleTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> scatter_layer_internal(const SimpleTensor<T> &src, const SimpleTensor<T> &update, const SimpleTensor<int32_t> &indices, const TensorShape &shape, const ScatterInfo &info);
+
+template <typename T>
+SimpleTensor<T> scatter_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &update, const SimpleTensor<int32_t> &indices, const TensorShape &shape, const ScatterInfo &info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_REFERENCE_SCATTERLAYER_H
diff --git a/tests/validation/reference/UtilsQuantizedAsymm.h b/tests/validation/reference/UtilsQuantizedAsymm.h
index 1f593bb696..e5ecc66545 100644
--- a/tests/validation/reference/UtilsQuantizedAsymm.h
+++ b/tests/validation/reference/UtilsQuantizedAsymm.h
@@ -32,6 +32,22 @@ namespace test
 {
 namespace validation
 {
+namespace
+{
+#if __clang__
+// Widens to 64 bits and explicitly sets the upper 32 bits for negative values. This has been tested on clang 7.0.2 (__clang_major__ == 7 && __clang_minor__ == 0 && __clang_patchlevel__ == 2)
+inline int64_t to_int64(int32_t val)
+{
+    return static_cast<int64_t>(val) | ((val < 0) ? (((1ll << 32) - 1) << 32) : 0);
+}
+#else  // __clang__
+inline int64_t to_int64(int32_t val)
+{
+    return static_cast<int64_t>(val);
+}
+#endif // __clang__
+} // namespace
+
 /** Rounded to nearest division by a power-of-two. */
 inline int32_t asymm_rounding_divide_by_pow2(int32_t x, int exponent)
 {
@@ -43,12 +59,12 @@ inline int32_t asymm_rounding_divide_by_pow2(int32_t x, int exponent)
 /** Multiplication of two integers. The same as ARMv7 Arm® Neon™ VQRDMULH instruction. */
 inline int32_t asymm_int_mult(int32_t a, int32_t b)
 {
-    bool    overflow = a == b && a == std::numeric_limits<int32_t>::min();
-    int64_t a_64(a);
-    int64_t b_64(b);
-    int64_t ab_64        = a_64 * b_64;
-    int32_t nudge        = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
-    int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31));
+    const bool    overflow     = a == b && a == std::numeric_limits<int32_t>::min();
+    const int64_t a_64         = to_int64(a);
+    const int64_t b_64         = to_int64(b);
+    const int64_t ab_64        = a_64 * b_64;
+    const int32_t nudge        = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
+    const int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31));
     return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32;
 }