From ed4b8a07e67c7802207c8954a88ad7a91aec79e0 Mon Sep 17 00:00:00 2001
From: Giorgio Arena <giorgio.arena@arm.com>
Date: Wed, 12 May 2021 12:28:58 +0100
Subject: Fix MeanStdDevNormalizationLayer reference outputting nan for FP16

- Bring the epsilon up to 1e-3 for FP16 (both backends) since it was causing the reference's variance being negative and its square root being NaN
- Bring the epsilon up to 1e-7 for FP16 NEON test for the same problem on the NEON kernel
- Adjust the CL kernel's vec_size when input tensor's width < 16 and use macros agnostic of vector size for sum reduction
- Add previously mismatching tensor shapes

Resolve COMPMID-4354

Change-Id: I823c871aacb72326f90c86b24cb16c3e2d4bd15e
Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5630
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
---
 .../CL/cl_kernels/mean_stddev_normalization.cl     | 32 +++++++++-------------
 .../CL/kernels/CLMeanStdDevNormalizationKernel.cpp | 29 ++++++--------------
 tests/datasets/ShapeDatasets.h                     |  6 ++--
 .../validation/CL/MeanStdDevNormalizationLayer.cpp |  4 +--
 .../NEON/MeanStdDevNormalizationLayer.cpp          |  4 +--
 5 files changed, 28 insertions(+), 47 deletions(-)

diff --git a/src/core/CL/cl_kernels/mean_stddev_normalization.cl b/src/core/CL/cl_kernels/mean_stddev_normalization.cl
index 4141d3e8b7..76be629934 100644
--- a/src/core/CL/cl_kernels/mean_stddev_normalization.cl
+++ b/src/core/CL/cl_kernels/mean_stddev_normalization.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019, 2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -76,31 +76,25 @@ __kernel void mean_stddev_normalization(
         sum_sq += data * data;
     }
     // Perform reduction
-#if VEC_SIZE > 8
-    sum.s01234567 += sum.s89abcdef;
-    sum_sq.s01234567 += sum_sq.s89abcdef;
-#endif // VEC_SIZE > 8
-#if VEC_SIZE > 4
-    sum.s0123 += sum.s4567;
-    sum_sq.s0123 += sum_sq.s4567;
-#endif // VEC_SIZE > 4
-#if VEC_SIZE > 2
-    sum.s01 += sum.s23;
-    sum_sq.s01 += sum_sq.s23;
-#endif // VEC_SIZE > 2
-    sum.s0 += sum.s1;
-    sum_sq.s0 += sum_sq.s1;
+    sum    = SUM_REDUCE(sum, VEC_SIZE);
+    sum_sq = SUM_REDUCE(sum_sq, VEC_SIZE);
+
+#if VEC_SIZE > 1
+#define sum sum.s0
+#define sum_sq sum_sq.s0
+#endif // VEC_SIZE > 1
+
     // Left-overs loop
     for(; i < WIDTH; ++i)
     {
         DATA_TYPE data = *((__global DATA_TYPE *)offset(&in, i, 0));
 
-        sum.s0 += data;
-        sum_sq.s0 += data * data;
+        sum += data;
+        sum_sq += data * data;
     }
 
-    DATA_TYPE mean       = sum.s0 / WIDTH;
-    DATA_TYPE var        = (sum_sq.s0 / WIDTH) - (mean * mean);
+    DATA_TYPE mean       = sum / WIDTH;
+    DATA_TYPE var        = (sum_sq / WIDTH) - (mean * mean);
     DATA_TYPE stddev_inv = 1.f / sqrt(var + EPSILON);
 
     i = 0;
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
index 7dc34f17b5..9f98b67582 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
@@ -54,22 +54,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
     }
     return Status{};
 }
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-    if(output != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-        // Output auto inizialitation if not yet initialized
-        auto_init_if_empty(*output, *input);
-    }
-
-    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
-    // This kernel doesn't need padding
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    return std::make_pair(Status{}, win);
-}
 } // namespace
 
 CLMeanStdDevNormalizationKernel::CLMeanStdDevNormalizationKernel()
@@ -90,10 +74,15 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_
 
     ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
 
+    if(output != nullptr)
+    {
+        auto_init_if_empty(*output->info(), *input->info());
+    }
+
     _input  = input;
     _output = output;
 
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
 
     // Set build options
     CLBuildOptions build_opts;
@@ -107,9 +96,8 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_
     _kernel = create_kernel(compile_context, "mean_stddev_normalization", build_opts.options());
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
 
     // Set config_id for enabling LWS tuning
     _config_id = "mean_stddev_normalization_layer_";
@@ -123,7 +111,6 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_
 Status CLMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, epsilon));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first);
     return Status{};
 }
 
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index c2b4cd7da4..37c5f1626d 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -88,9 +88,9 @@ public:
     Small2DShapes()
         : ShapeDataset("Shape",
     {
-        TensorShape{ 7U, 7U },
-                     TensorShape{ 27U, 13U },
-                     TensorShape{ 128U, 64U }
+        TensorShape{ 1U, 7U },
+                     TensorShape{ 5U, 13U },
+                     TensorShape{ 32U, 64U }
     })
     {
     }
diff --git a/tests/validation/CL/MeanStdDevNormalizationLayer.cpp b/tests/validation/CL/MeanStdDevNormalizationLayer.cpp
index e77a21ed7f..cdeb622130 100644
--- a/tests/validation/CL/MeanStdDevNormalizationLayer.cpp
+++ b/tests/validation/CL/MeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -78,7 +78,7 @@ TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLMeanStdDevNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(),
                        framework::dataset::make("DataType", DataType::F16)),
                        framework::dataset::make("InPlace", { false, true })),
-                       framework::dataset::make("Epsilon", { 1e-8 })))
+                       framework::dataset::make("Epsilon", { 1e-3 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp b/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
index 20e3bd5325..90d3d05a0d 100644
--- a/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
+++ b/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
@@ -81,7 +81,7 @@ TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEMeanStdDevNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(),
                        framework::dataset::make("DataType", DataType::F16)),
                        framework::dataset::make("InPlace", { false, true })),
-                       framework::dataset::make("Epsilon", { 1e-8 })))
+                       framework::dataset::make("Epsilon", { 1e-3 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -101,7 +101,7 @@ TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEMeanStdDevNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(),
                        framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("InPlace", { false, true })),
-                       framework::dataset::make("Epsilon", { 1e-8 })))
+                       framework::dataset::make("Epsilon", { 1e-7 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
-- 
cgit v1.2.1