From 2897e61e8fe04aaf95540f4525c3dd3f7f46ebfa Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Tue, 20 Nov 2018 18:38:29 +0000 Subject: COMPMID-1645 NEL2Normalization for FP32/FP16 & NHWC Change-Id: I29e35024e29781a6b943b568abec9c73649215e6 --- .../core/NEON/kernels/NEL2NormalizeLayerKernel.h | 8 +- .../core/NEON/wrapper/intrinsics/intrinsics.h | 1 + arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h | 53 ++++++++ .../runtime/NEON/functions/NEL2NormalizeLayer.h | 8 +- src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp | 2 +- src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp | 139 +++++++++++++++++++-- .../NEON/kernels/NEReductionOperationKernel.cpp | 12 +- src/runtime/CL/functions/CLL2NormalizeLayer.cpp | 5 +- src/runtime/NEON/functions/NEL2NormalizeLayer.cpp | 9 +- .../NEON/functions/NEReductionOperation.cpp | 2 +- tests/validation/CL/L2NormalizeLayer.cpp | 4 +- tests/validation/NEON/L2NormalizeLayer.cpp | 43 +++++-- 12 files changed, 243 insertions(+), 43 deletions(-) create mode 100644 arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h diff --git a/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h b/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h index 0de07fdab7..f893c4ae6b 100644 --- a/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h @@ -52,24 +52,24 @@ public: ~NEL2NormalizeLayerKernel() = default; /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW. + * @param[in] input Source tensor. Data types supported: F16/F32. * @param[in] sum Sum values tensor. Data types supported: same as @p input. * Sum will have the same number of dimensions as input. * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. * Output will have the same number of dimensions as input. - * @param[in] axis Dimension along which to reduce. 
Supported reduction axis : 0 + * @param[in] axis Dimension along which to reduce. Supported reduction axis : 0, 1, 2 * @param[in] epsilon Lower bound value for the normalization. */ void configure(const ITensor *input, const ITensor *sum, ITensor *output, unsigned int axis, float epsilon); /** Static function to check if given info will lead to a valid configuration of @ref NEL2NormalizeLayerKernel. * - * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW. + * @param[in] input Source tensor info. Data types supported: F16/F32. * @param[in] sum Sum values tensor info. Data types supported: same as @p input. * Sum will have the same number of dimensions as input. * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input. * Output will have the same number of dimensions as input. - * @param[in] axis Dimension along which to reduce. Supported reduction axis : 0 + * @param[in] axis Dimension along which to reduce. Supported reduction axis : 0, 1, 2 * @param[in] epsilon Lower bound value for the normalization. 
* * @return a status diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h index 2e6fd75005..7ea0aba565 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h +++ b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h @@ -32,6 +32,7 @@ #include "arm_compute/core/NEON/wrapper/intrinsics/getlane.h" #include "arm_compute/core/NEON/wrapper/intrinsics/getlow.h" #include "arm_compute/core/NEON/wrapper/intrinsics/inv.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h" #include "arm_compute/core/NEON/wrapper/intrinsics/load.h" #include "arm_compute/core/NEON/wrapper/intrinsics/max.h" #include "arm_compute/core/NEON/wrapper/intrinsics/min.h" diff --git a/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h b/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h new file mode 100644 index 0000000000..0bbf49b5c0 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_INVSQRT_H__
+#define __ARM_COMPUTE_WRAPPER_INVSQRT_H__
+
+#include "arm_compute/core/NEON/NEMath.h"
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VINVSQRT_IMPL(stype, vtype, prefix, postfix) \
+    inline vtype vinvsqrt(const vtype &a)            \
+    {                                                \
+        return prefix##_##postfix(a);                \
+    }
+
+VINVSQRT_IMPL(float, float32x2_t, vinvsqrt, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VINVSQRT_IMPL(float16_t, float16x4_t, vinvsqrt, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VINVSQRT_IMPL(float, float32x4_t, vinvsqrtq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VINVSQRT_IMPL(float16_t, float16x8_t, vinvsqrtq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VINVSQRT_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_INVSQRT_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
index c089856fcd..ba506fa9ab 100644
--- a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
@@ -50,18 +50,18 @@ public:
     NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input and output tensors.
      *
-     * @param[in, out] input   Source tensor. Data types supported: F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
+     * @param[in, out] input   Source tensor. Data types supported: F16/F32. (Written to only for border_size != 0)
      * @param[out]     output  Destination tensor. Data types and data layouts supported: same as @p input.
-     * @param[in]      axis    Dimension along which to reduce. 
Supported reduction axis : 0 + * @param[in] axis Dimension along which to reduce. Supported reduction axis : 0, 1, 2 * @param[in] epsilon (Optional) Lower bound value for the normalization. */ void configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon = 1e-12f); /** Static function to check if given info will lead to a valid configuration of @ref NEL2NormalizeLayer. * - * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW. (Written to only for border_size != 0) + * @param[in] input Source tensor info. Data types supported: F16/F32. (Written to only for border_size != 0) * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input. - * @param[in] axis Dimension along which to reduce. Supported reduction axis : 0 + * @param[in] axis Dimension along which to reduce. Supported reduction axis : 0, 1, 2 * @param[in] epsilon (Optional) Lower bound value for the normalization. * * @return a status diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp index 97dd919d08..4f36046b28 100644 --- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp +++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp @@ -50,7 +50,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 2, "Axis greater than 2 is not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); // Reduce shape on axis diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp 
b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index ed037832af..cda041de66 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -32,15 +32,20 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include <arm_neon.h>
 #include <cmath>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
+template <typename T, int S>
 void l2_normalize_X(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
 {
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
     Window window_sum(window);
     window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
 
@@ -53,30 +58,97 @@ void l2_normalize_X(const ITensor *in, const ITensor *sum, ITensor *out, float e
         Iterator sum_it(sum, sum_slice);
         Iterator output_it(out, in_slice);
 
-        const float       sum_value           = *reinterpret_cast<const float *>(sum_it.ptr());
-        const float32x4_t vec_normalize_value = vdupq_n_f32(1.f / std::sqrt(std::max(sum_value, epsilon)));
+        const auto sum_value           = *reinterpret_cast<const T *>(sum_it.ptr());
+        const auto vec_normalize_value = wrapper::vdup_n(static_cast<T>(1.f / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)))), ExactTagType{});
 
         execute_window_loop(in_slice, [&](const Coordinates & id)
         {
-            const auto in_ptr  = reinterpret_cast<const float *>(input_it.ptr());
-            const auto out_ptr = reinterpret_cast<float *>(output_it.ptr());
+            const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
+            const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
 
-            vst1q_f32(out_ptr, vmulq_f32(vld1q_f32(in_ptr), vec_normalize_value));
+            wrapper::vstore(out_ptr, wrapper::vmul(wrapper::vloadq(in_ptr), vec_normalize_value));
         },
         input_it, output_it);
     }
     while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
 }
 
+template <typename T, int S>
+void l2_normalize_Y(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
+{
+    /** NEON vector tag 
type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    Window window_sum(window);
+    window_sum.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    Window in_slice  = window.first_slice_window_2D();
+    Window sum_slice = window_sum.first_slice_window_2D();
+
+    do
+    {
+        Iterator input_it(in, in_slice);
+        Iterator sum_it(sum, sum_slice);
+        Iterator output_it(out, in_slice);
+
+        auto eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
+
+        execute_window_loop(in_slice, [&](const Coordinates & id)
+        {
+            const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
+            const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
+            const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+            const auto vec_normalize_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr), eps));
+            wrapper::vstore(out_ptr, wrapper::vmul(wrapper::vloadq(in_ptr), vec_normalize_value));
+        },
+        input_it, sum_it, output_it);
+    }
+    while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+}
+
+template <typename T, int S>
+void l2_normalize_Z(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
+{
+    /** NEON vector tag type. 
*/
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    Window window_sum(window);
+    window_sum.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    Window in_slice  = window.first_slice_window_3D();
+    Window sum_slice = window_sum.first_slice_window_3D();
+
+    do
+    {
+        Iterator input_it(in, in_slice);
+        Iterator sum_it(sum, sum_slice);
+        Iterator output_it(out, in_slice);
+
+        auto eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
+
+        execute_window_loop(in_slice, [&](const Coordinates & id)
+        {
+            const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
+            const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
+            const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+            const auto vec_normalize_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr), eps));
+            wrapper::vstore(out_ptr, wrapper::vmul(wrapper::vloadq(in_ptr), vec_normalize_value));
+        },
+        input_it, sum_it, output_it);
+    }
+    while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
+}
+
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, unsigned int axis, float epsilon)
 {
     ARM_COMPUTE_UNUSED(epsilon);
 
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported normalization axis, Supported axis is 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 2, "Axis greater than 2 is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Normalization axis greater than max number of dimensions");
 
     // Reduce shape on axis
@@ -89,7 +161,7 @@ Status validate_arguments(const ITensorInfo 
*sum, cons
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
-        ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
     }
 
     return Status{};
@@ -158,9 +230,52 @@ void NEL2NormalizeLayerKernel::run(const Window &window, const ThreadInfo &info)
     switch(_axis)
     {
         case 0:
-            l2_normalize_X(_input, _sum, _output, _epsilon, window);
+            switch(_input->info()->data_type())
+            {
+                case DataType::F32:
+                    l2_normalize_X<float, 4>(_input, _sum, _output, _epsilon, window);
+                    break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    l2_normalize_X<float16_t, 8>(_input, _sum, _output, _epsilon, window);
+                    break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                default:
+                    ARM_COMPUTE_ERROR("Not implemented");
+            }
+            break;
+        case 1:
+            switch(_input->info()->data_type())
+            {
+                case DataType::F32:
+                    l2_normalize_Y<float, 4>(_input, _sum, _output, _epsilon, window);
+                    break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    l2_normalize_Y<float16_t, 8>(_input, _sum, _output, _epsilon, window);
+                    break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                default:
+                    ARM_COMPUTE_ERROR("Not implemented");
+            }
+            break;
+        case 2:
+            switch(_input->info()->data_type())
+            {
+                case DataType::F32:
+                    l2_normalize_Z<float, 4>(_input, _sum, _output, _epsilon, window);
+                    break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    l2_normalize_Z<float16_t, 8>(_input, _sum, _output, _epsilon, window);
+                    break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                default:
+                    ARM_COMPUTE_ERROR("Not implemented");
+            }
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported normalization axis");
     }
 }
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 182e93d177..9306e0303d 100644
--- 
a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -66,12 +66,14 @@ public: { // Set in window Window in_window(window); + Window out_window(window); in_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1), output->info()->dimension(1))); // Get first input and output slices Window in_slice = in_window.first_slice_window_2D(); - Window out_slice = window.first_slice_window_2D(); + Window out_slice = out_window.first_slice_window_2D(); do { @@ -80,18 +82,20 @@ public: f(in, out, in_slice, out_slice, *input->info(), 1); } - while(in_window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); + while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); } static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f) { // Set in window Window in_window(window); + Window out_window(window); in_window.set(Window::DimZ, Window::Dimension(0, 1, 1)); + out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2), output->info()->dimension(2))); // Get first input and output slices Window in_slice = in_window.first_slice_window_3D(); - Window out_slice = window.first_slice_window_3D(); + Window out_slice = out_window.first_slice_window_3D(); do { @@ -100,7 +104,7 @@ public: f(in, out, in_slice, out_slice, *input->info(), 2); } - while(in_window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice)); + while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_3D(out_slice)); } static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f) { diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp index 4f709d561d..2e3c6d7763 100644 --- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp +++ 
b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -32,8 +32,8 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
@@ -81,3 +81,4 @@ void CLL2NormalizeLayer::run()
 
     _memory_group.release();
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index d0b80fb1b8..56da966abf 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -26,8 +26,8 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
@@ -57,8 +57,8 @@ Status NEL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo
 
     ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
 
-    // Reduce shape on axis (supported axis is 0)
-    shape.set(0, 1);
+    // Reduce shape on axis
+    shape.set(axis, 1);
     sum_sq.set_tensor_shape(shape);
 
     ARM_COMPUTE_RETURN_ON_ERROR(NEL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
@@ -75,3 +75,4 @@ void NEL2NormalizeLayer::run()
 
     _memory_group.release();
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 188c2bbb18..bb27b5d47a 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -86,7 +86,7 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, 
unsigned i if(axis == 0) { // Configure fill border kernel - BorderSize fill_border_size = (axis == 0) ? _reduction_kernel.border_size() : BorderSize(); + BorderSize fill_border_size = _reduction_kernel.border_size(); BorderMode fill_border_mode = reduction_operation_border_mode(op); _fill_border_kernel.configure(input, fill_border_size, fill_border_mode, PixelValue(static_cast(0.f))); } diff --git a/tests/validation/CL/L2NormalizeLayer.cpp b/tests/validation/CL/L2NormalizeLayer.cpp index 13a33a864c..fdbfa3ed4d 100644 --- a/tests/validation/CL/L2NormalizeLayer.cpp +++ b/tests/validation/CL/L2NormalizeLayer.cpp @@ -62,7 +62,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(128U, 64U), 2, DataType::F32), // Number of Input channels != 1 TensorInfo(TensorShape(128U, 64U), 1, DataType::S16), // DataType != F32 TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis >= num_max_dimensions - TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 3 + TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 2 TensorInfo(TensorShape(128U, 64U), 1, DataType::F32) }), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(128U, 64U), 1, DataType::F16), @@ -73,7 +73,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), TensorInfo(TensorShape(128U, 64U), 1, DataType::F32) })), - framework::dataset::make("Axis", { 0U, 0U, 0U, 0U, static_cast(TensorShape::num_max_dimensions), 4U, 0U })), + framework::dataset::make("Axis", { 0U, 0U, 0U, 0U, static_cast(TensorShape::num_max_dimensions), 3U, 0U })), framework::dataset::make("Expected", { false, false, false, false, false, false, true })), input_info, output_info, axis, expected) { diff --git a/tests/validation/NEON/L2NormalizeLayer.cpp b/tests/validation/NEON/L2NormalizeLayer.cpp index 0a1ddba77c..3164a65417 100644 --- a/tests/validation/NEON/L2NormalizeLayer.cpp +++ 
b/tests/validation/NEON/L2NormalizeLayer.cpp @@ -44,6 +44,9 @@ namespace { /** Tolerance for float operations */ RelativeTolerance tolerance_f32(0.00001f); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +RelativeTolerance tolerance_f16(0.2f); +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC } // namespace TEST_SUITE(NEON) @@ -57,7 +60,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(128U, 64U), 2, DataType::F32), // Number of Input channels != 1 TensorInfo(TensorShape(128U, 64U), 1, DataType::S16), // DataType != F32 TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis >= num_max_dimensions - TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 0 + TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 2 TensorInfo(TensorShape(128U, 64U), 1, DataType::F32) }), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(128U, 64U), 1, DataType::F16), @@ -68,7 +71,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), TensorInfo(TensorShape(128U, 64U), 1, DataType::F32) })), - framework::dataset::make("Axis", { 0U, 0U, 0U, 0U, static_cast(TensorShape::num_max_dimensions), 1U, 0U })), + framework::dataset::make("Axis", { 0U, 0U, 0U, 0U, static_cast(TensorShape::num_max_dimensions), 3U, 0U })), framework::dataset::make("Expected", { false, false, false, false, false, false, true })), input_info, output_info, axis, expected) { @@ -85,8 +88,8 @@ using NEL2NormalizeLayerFixture = L2NormalizeLayerValidationFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW })), - framework::dataset::make("Axis", { 0 })), + combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { 
DataLayout::NCHW, DataLayout::NHWC })), + framework::dataset::make("Axis", { 0, 1, 2 })), framework::dataset::make("Epsilon", { 1e-12 }))) { // Validate output @@ -94,17 +97,39 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEL2NormalizeLayerFixture, framework::Da } FIXTURE_DATA_TEST_CASE(RunLarge, NEL2NormalizeLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW })), - framework::dataset::make("Axis", { 0 })), + combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + framework::dataset::make("Axis", { 0, 1, 2 })), framework::dataset::make("Epsilon", { 1e-12 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } -TEST_SUITE_END() +TEST_SUITE_END() // FP32 -TEST_SUITE_END() -TEST_SUITE_END() +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +TEST_SUITE(FP16) +FIXTURE_DATA_TEST_CASE(RunSmall, NEL2NormalizeLayerFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + framework::dataset::make("Axis", { 0, 1, 2 })), + framework::dataset::make("Epsilon", { 1e-12 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_f16); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, NEL2NormalizeLayerFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + framework::dataset::make("Axis", { 0, 1, 2 })), + framework::dataset::make("Epsilon", { 1e-12 }))) +{ + // Validate output + 
validate(Accessor(_target), _reference, tolerance_f16); +} +TEST_SUITE_END() // FP16 +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +TEST_SUITE_END() // L2NormalizeLayer +TEST_SUITE_END() // NEON } // namespace validation } // namespace test } // namespace arm_compute -- cgit v1.2.1