From 8aaf93e8c12ce93d3d0082d4f4b70376f15536da Mon Sep 17 00:00:00 2001
From: Michalis Spyrou
Date: Thu, 11 Oct 2018 17:33:32 +0100
Subject: COMPMID-1632 Add CLL2NormalizationLayer for NHWC and FP32

Change-Id: Iae22554d5fe893fd22a000eab5bfd8275ea06eb3
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/154102
Reviewed-by: Georgios Pinitas
Tested-by: bsgcomp
---
 .../core/CL/kernels/CLReductionOperationKernel.h   |   4 +-
 .../runtime/CL/functions/CLL2NormalizeLayer.h      |   8 +-
 .../runtime/CL/functions/CLReductionOperation.h    |   8 +-
 src/core/CL/CLKernelLibrary.cpp                    |   3 +-
 src/core/CL/cl_kernels/l2_normalize.cl             |  52 +++++-
 src/core/CL/cl_kernels/reduction_operation.cl      |  21 ++-
 src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp   |  67 ++++++--
 src/core/CL/kernels/CLReductionOperationKernel.cpp |   3 +-
 src/runtime/CL/functions/CLReductionOperation.cpp  |   2 +-
 tests/validation/CL/L2NormalizeLayer.cpp           |  34 +++-
 tests/validation/CL/ReductionOperation.cpp         |   6 +-
 tests/validation/NEON/L2NormalizeLayer.cpp         |   8 +-
 .../validation/fixtures/L2NormalizeLayerFixture.h  |  38 ++++-
 tests/validation/reference/L2NormalizeLayer.cpp    |   5 +-
 tests/validation/reference/ReductionOperation.cpp  | 186 ++++++---------------
 15 files changed, 250 insertions(+), 195 deletions(-)

diff --git a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
index ef24fd5166..b5b90a15ce 100644
--- a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
+++ b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
@@ -50,7 +50,7 @@ public:
 
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32. Data layouts supported: NCHW.
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32.
      * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
      *                    Output will have the same number of dimensions as input.
      * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0,1,2,3
@@ -61,7 +61,7 @@ public:
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel.
      *
-     * @param[in] input  Source tensor info. Data types supported: QASYMM8/F16/F32. Data layouts supported: NCHW.
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8/F16/F32.
      * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
      *                   Output will have the same number of dimensions as input.
      * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0,1,2,3
diff --git a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
index d3d34f877b..86ccd3440f 100644
--- a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
+++ b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
@@ -53,18 +53,18 @@ public:
 
     /** Set the input and output tensors.
      *
-     * @param[in]  input   Source tensor. Data types supported: F32. Data layouts supported: NCHW.
+     * @param[in]  input   Source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC.
      * @param[out] output  Destination tensor. Data types and data layouts supported: Same as @p input.
-     * @param[in]  axis    Axis along which to reduce. Supported reduction axis : 0
+     * @param[in]  axis    Axis along which to reduce. Supported reduction axis : 0, 2
      * @param[in]  epsilon (Optional) Lower bound value for the normalization.
      */
     void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon = 1e-12);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayer.
      *
-     * @param[in] input   Source tensor info. Data types supported: F32. Data layouts supported: NCHW.
+     * @param[in] input   Source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
      * @param[in] output  Destination tensor info. Data types and data layouts supported: Same as @p input.
-     * @param[in] axis    Axis along which to reduce. Supported reduction axis : 0
+     * @param[in] axis    Axis along which to reduce. Supported reduction axis : 0, 2
      * @param[in] epsilon (Optional) Lower bound value for the normalization.
      *
      * @return a status
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index 42081786bc..ac35b7bfc6 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -53,18 +53,18 @@ public:
 
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32. Data layouts supported: NCHW.
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32.
      * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
-     * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0
+     * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
      * @param[in]  op     Reduction operation to perform.
      */
     void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperation.
      *
-     * @param[in] input  Source tensor info. Data types supported: QASYMM8/F16/F32. Data layouts supported: NCHW.
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8/F16/F32.
      * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
-     * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0
+     * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
      * @param[in] op     Reduction operation to perform.
      *
      * @return a status
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index a2428ca99d..900cb04b1a 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -296,7 +296,8 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "IYUV_to_RGB888_bt709", "color_convert.cl" },
     { "IYUV_to_RGBA8888_bt709", "color_convert.cl" },
     { "IYUV_to_YUV444_bt709", "color_convert.cl" },
-    { "l2_normalize", "l2_normalize.cl" },
+    { "l2_normalize_nchw", "l2_normalize.cl" },
+    { "l2_normalize_nhwc", "l2_normalize.cl" },
     { "lktracker_stage0", "optical_flow_pyramid_lk.cl" },
     { "lktracker_stage1", "optical_flow_pyramid_lk.cl" },
     { "magnitude_phase", "magnitude_phase.cl" },
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/l2_normalize.cl
index f58e98bace..d230487030 100644
--- a/src/core/CL/cl_kernels/l2_normalize.cl
+++ b/src/core/CL/cl_kernels/l2_normalize.cl
@@ -23,7 +23,7 @@
  */
 #include "helpers.h"
 
-/** This kernel performs reduction given an operation.
+/** This kernel performs l2 normalization. (NCHW)
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g.
-DDATA_TYPE=float * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32 @@ -42,7 +42,7 @@ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor * @param[in] epsilon Epsilon value */ -__kernel void l2_normalize( +__kernel void l2_normalize_nchw( VECTOR_DECLARATION(src), VECTOR_DECLARATION(sum), VECTOR_DECLARATION(dst), @@ -55,7 +55,53 @@ __kernel void l2_normalize( VEC_DATA_TYPE(DATA_TYPE, 16) in = vload16(0, (__global DATA_TYPE *)src.ptr); VEC_DATA_TYPE(DATA_TYPE, 16) - normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))native_rsqrt(fmax(((__global DATA_TYPE *)sum.ptr)[0], epsilon)); + normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(((__global DATA_TYPE *)sum.ptr)[0], epsilon)); + + vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr); +} + +/** This kernel performs l2 normalization. (NHWC) + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along X processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] epsilon Epsilon value + */ +__kernel void l2_normalize_nhwc( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(sum), + IMAGE_DECLARATION(dst), + DATA_TYPE epsilon) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image sum = CONVERT_TO_IMAGE_STRUCT(sum); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + VEC_DATA_TYPE(DATA_TYPE, 16) + in = vload16(0, (__global DATA_TYPE *)src.ptr); + VEC_DATA_TYPE(DATA_TYPE, 16) + sums = vload16(0, (__global DATA_TYPE *)sum.ptr); + + VEC_DATA_TYPE(DATA_TYPE, 16) + normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(sums, epsilon)); vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr); } \ No newline at end of file diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl index c1be4472a7..d76e12ac04 100644 --- a/src/core/CL/cl_kernels/reduction_operation.cl +++ b/src/core/CL/cl_kernels/reduction_operation.cl @@ -189,7 +189,12 @@ __kernel void reduction_operation_y( for(unsigned int y = 0; y < HEIGHT; ++y) { - res += CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); + VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) + in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); +#if defined(SUM_SQUARE) + in *= in; +#endif // SQRSUM + res += in; } #if defined(MEAN) @@ -236,7 +241,12 @@ __kernel void reduction_operation_z( for(unsigned int z = 0; z < DEPTH; ++z) { - res += CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); + VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) + in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); +#if defined(SUM_SQUARE) + in *= in; +#endif // SQRSUM + res += in; } #if defined(MEAN) @@ -288,7 +298,12 @@ __kernel void reduction_operation_w( for(unsigned int w = 0; w < BATCH; ++w) { - res += CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); + VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) + in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); +#if defined(SUM_SQUARE) + in *= in; +#endif // SQRSUM + res += in; } #if defined(MEAN) diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp index 54ed51eda2..cfd04ef392 100644 --- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp +++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp @@ -49,9 +49,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported 
reduction axis, Supported axis is 0"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); // Reduce shape on axis @@ -62,9 +61,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons if(output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW); } return Status{}; @@ -110,11 +109,19 @@ void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); // Create kernel - _kernel = static_cast(CLKernelLibrary::get().create_kernel("l2_normalize", build_opts)); + const DataLayout data_layout = input->info()->data_layout(); + _kernel = static_cast(CLKernelLibrary::get().create_kernel("l2_normalize_" + lower_string(string_from_data_layout(data_layout)), build_opts)); // Set epsilon argument - unsigned int idx = num_arguments_per_1D_tensor() * 3; - _kernel.setArg(idx, _epsilon); + unsigned int idx = data_layout == DataLayout::NCHW ? num_arguments_per_1D_tensor() * 3 : num_arguments_per_2D_tensor() * 3; + if(input->info()->data_type() == DataType::F32) + { + _kernel.setArg(idx, _epsilon); + } + else + { + _kernel.setArg(idx, _epsilon); + } // Configure kernel window auto win_config = validate_and_configure_window(_input->info(), _output->info()); @@ -137,18 +144,42 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); Window window_sum(window); - window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); - - Window in_slice = window.first_slice_window_1D(); - Window sum_slice = window_sum.first_slice_window_1D(); - do + switch(_input->info()->data_layout()) { - unsigned int idx = 0; - add_1D_tensor_argument(idx, _input, in_slice); - add_1D_tensor_argument(idx, _sum, sum_slice); - add_1D_tensor_argument(idx, _output, in_slice); - enqueue(queue, *this, in_slice); + case DataLayout::NCHW: + { + window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); + Window in_slice = window.first_slice_window_1D(); + Window sum_slice = window_sum.first_slice_window_1D(); + do + { + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, in_slice); + add_1D_tensor_argument(idx, _sum, sum_slice); + add_1D_tensor_argument(idx, _output, in_slice); + enqueue(queue, *this, in_slice); + } + while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); + } + break; + case DataLayout::NHWC: + { + window_sum.set(Window::DimY, Window::Dimension(0, 0, 0)); + Window in_slice = window.first_slice_window_2D(); + Window sum_slice = window_sum.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, in_slice); + add_2D_tensor_argument(idx, _sum, sum_slice); + add_2D_tensor_argument(idx, _output, in_slice); + enqueue(queue, *this, in_slice); + } + while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); + } + 
break; + default: + ARM_COMPUTE_ERROR("Not supported"); } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); } diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp index d4165ccd4e..ef46325e4d 100644 --- a/src/core/CL/kernels/CLReductionOperationKernel.cpp +++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp @@ -46,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && axis != 0, "Not supported reduction operation for this axis"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); ARM_COMPUTE_RETURN_ERROR_ON(op == ReductionOperation::MEAN_SUM && axis == 0 && width == 0 && input->data_type() != DataType::QASYMM8); @@ -142,6 +142,7 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou } build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted); + build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE="); build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN"); switch(op) diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index 4b65c47392..52a5d91cb8 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -119,7 +119,7 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign for(unsigned int i = 0; i < _num_of_stages - 1; i++) { shape.set(0, ceil(shape.x() / 128.f)); - _sums_vector[i].allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())); + _sums_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape)); } // Apply ReductionOperation only on first kernel diff --git a/tests/validation/CL/L2NormalizeLayer.cpp b/tests/validation/CL/L2NormalizeLayer.cpp index 3d121b079d..517ba84069 100644 --- a/tests/validation/CL/L2NormalizeLayer.cpp +++ b/tests/validation/CL/L2NormalizeLayer.cpp @@ -44,6 +44,10 @@ namespace { /** Tolerance for float operations */ constexpr AbsoluteTolerance tolerance_f32(0.00001f); +constexpr AbsoluteTolerance tolerance_f16(0.01f); + +auto data = concat(combine(framework::dataset::make("DataLayout", { DataLayout::NCHW }), framework::dataset::make("Axis", { 0 })), combine(framework::dataset::make("DataLayout", { DataLayout::NHWC }), + framework::dataset::make("Axis", { 1 }))); } // namespace @@ -58,7 +62,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(128U, 64U), 2, DataType::F32), // Number of Input channels != 1 TensorInfo(TensorShape(128U, 64U), 1, DataType::S16), // DataType != F32 TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis >= num_max_dimensions - TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 0 + 
TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 3 TensorInfo(TensorShape(128U, 64U), 1, DataType::F32) }), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(128U, 64U), 1, DataType::F16), @@ -69,7 +73,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), TensorInfo(TensorShape(128U, 64U), 1, DataType::F32) })), - framework::dataset::make("Axis", { 0U, 0U, 0U, 0U, static_cast(TensorShape::num_max_dimensions), 1U, 0U })), + framework::dataset::make("Axis", { 0U, 0U, 0U, 0U, static_cast(TensorShape::num_max_dimensions), 4U, 0U })), framework::dataset::make("Expected", { false, false, false, false, false, false, true })), input_info, output_info, axis, expected) { @@ -87,22 +91,36 @@ using CLL2NormalizeLayerFixture = L2NormalizeLayerValidationFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0 })), framework::dataset::make("Epsilon", { 1e-12 }))) + combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), data), framework::dataset::make("Epsilon", { 1e-12 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, CLL2NormalizeLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0 })), framework::dataset::make("Epsilon", { 1e-12 }))) + combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), data), framework::dataset::make("Epsilon", { 1e-12 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // FP32 +TEST_SUITE(FP16) +FIXTURE_DATA_TEST_CASE(RunSmall, CLL2NormalizeLayerFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), data), framework::dataset::make("Epsilon", { 1e-6 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f16); +} +FIXTURE_DATA_TEST_CASE(RunLarge, CLL2NormalizeLayerFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), data), framework::dataset::make("Epsilon", { 1e-6 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f16); +} +TEST_SUITE_END() // FP16 +TEST_SUITE_END() // Float -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // L2NormalizeLayer +TEST_SUITE_END() // CL } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/CL/ReductionOperation.cpp b/tests/validation/CL/ReductionOperation.cpp index 516a1341cc..2adb4e90d6 100644 --- a/tests/validation/CL/ReductionOperation.cpp +++ b/tests/validation/CL/ReductionOperation.cpp @@ -58,16 +58,16 @@ TEST_SUITE(ReductionOperation) DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( framework::dataset::make("InputInfo", { TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Mismatching data type input/output TensorInfo(TensorShape(128U, 64U), 2, DataType::F32), // Number of Input channels != 1 - TensorInfo(TensorShape(128U, 64U), 1, DataType::S16), // DataType != F16/F32 + TensorInfo(TensorShape(128U, 64U), 1, 
DataType::S16), // DataType != QASYMM8/F16/F32 TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis >= num_max_dimensions - TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 0 and SUM_SQUARE + TensorInfo(TensorShape(128U, 64U), 1, DataType::QASYMM8), // Axis == 0 and SUM_SQUARE and QASYMM8 TensorInfo(TensorShape(128U, 64U), 1, DataType::F32) }), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(1U, 64U), 1, DataType::F16), TensorInfo(TensorShape(1U, 64U), 1, DataType::F32), TensorInfo(TensorShape(1U, 64U), 1, DataType::S16), TensorInfo(TensorShape(1U, 64U), 1, DataType::F32), - TensorInfo(TensorShape(1U, 64U), 1, DataType::F32), + TensorInfo(TensorShape(1U, 64U), 1, DataType::QASYMM8), TensorInfo(TensorShape(1U, 64U), 1, DataType::F32) })), framework::dataset::make("Axis", { 0U, 0U, 0U, static_cast(TensorShape::num_max_dimensions), 1U, 0U })), diff --git a/tests/validation/NEON/L2NormalizeLayer.cpp b/tests/validation/NEON/L2NormalizeLayer.cpp index f868adea3b..0a1ddba77c 100644 --- a/tests/validation/NEON/L2NormalizeLayer.cpp +++ b/tests/validation/NEON/L2NormalizeLayer.cpp @@ -85,14 +85,18 @@ using NEL2NormalizeLayerFixture = L2NormalizeLayerValidationFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0 })), framework::dataset::make("Epsilon", { 1e-12 }))) + combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW })), + framework::dataset::make("Axis", { 0 })), + framework::dataset::make("Epsilon", { 1e-12 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, NEL2NormalizeLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0 })), framework::dataset::make("Epsilon", { 1e-12 }))) + combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW })), + framework::dataset::make("Axis", { 0 })), + framework::dataset::make("Epsilon", { 1e-12 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); diff --git a/tests/validation/fixtures/L2NormalizeLayerFixture.h b/tests/validation/fixtures/L2NormalizeLayerFixture.h index 6f11dcb658..097d1c4ec2 100644 --- a/tests/validation/fixtures/L2NormalizeLayerFixture.h +++ b/tests/validation/fixtures/L2NormalizeLayerFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -45,10 +45,10 @@ class L2NormalizeLayerValidationFixture : public framework::Fixture { public: template - void setup(TensorShape shape, DataType data_type, unsigned int axis, float epsilon) + void setup(TensorShape shape, DataType data_type, DataLayout data_layout, unsigned int axis, float epsilon) { - _target = compute_target(shape, data_type, axis, epsilon); - _reference = compute_reference(shape, data_type, axis, epsilon); + _target = compute_target(shape, data_type, data_layout, axis, epsilon); + _reference = compute_reference(shape, data_type, data_layout, axis, epsilon); } protected: @@ -58,11 +58,16 @@ protected: library->fill_tensor_uniform(tensor, 0); } - TensorType compute_target(const TensorShape &shape, DataType data_type, unsigned int axis, float epsilon) + TensorType compute_target(TensorShape shape, DataType data_type, DataLayout data_layout, unsigned int axis, float epsilon) { + if(data_layout == DataLayout::NHWC) + { + permute(shape, PermutationVector(2U, 0U, 1U)); + } + // Create tensors - TensorType src = create_tensor(shape, data_type); - TensorType dst = create_tensor(shape, data_type); + TensorType src = create_tensor(shape, data_type, 1, QuantizationInfo(), data_layout); + TensorType dst = create_tensor(shape, data_type, 1, QuantizationInfo(), data_layout); // Create and configure function FunctionType l2_norm_func; @@ -87,8 +92,25 @@ protected: return dst; } - SimpleTensor compute_reference(const TensorShape &shape, DataType data_type, unsigned int axis, float epsilon) + SimpleTensor compute_reference(const TensorShape &shape, DataType data_type, DataLayout data_layout, unsigned int axis, float epsilon) { + if(data_layout == DataLayout::NHWC) + { + switch(axis) + { + case 0: + axis = 2; + break; + case 1: + axis = 0; + break; + case 2: + axis = 1; + break; + default: + break; + } + } // Create reference SimpleTensor src{ shape, data_type }; diff --git a/tests/validation/reference/L2NormalizeLayer.cpp b/tests/validation/reference/L2NormalizeLayer.cpp index 99f4e8a6e6..26677511e4 100644 --- a/tests/validation/reference/L2NormalizeLayer.cpp +++ b/tests/validation/reference/L2NormalizeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -66,7 +66,7 @@ SimpleTensor l2_normalize(const SimpleTensor &src, unsigned int axis, floa { const T *src_row_ptr = src.data() + du * elems; T *dst_row_ptr = dst.data() + du * elems; - const T normalization_value = std::sqrt(std::max(sum[du], epsilon)); + const T normalization_value = sqrt(std::max(sum[du], static_cast(epsilon))); std::transform(src_row_ptr, src_row_ptr + elems, dst_row_ptr, [normalization_value](T val) { return val / normalization_value; @@ -82,6 +82,7 @@ SimpleTensor l2_normalize(const SimpleTensor &src, unsigned int axis, floa } template SimpleTensor l2_normalize(const SimpleTensor &src, unsigned int axis, float epsilon); +template SimpleTensor l2_normalize(const SimpleTensor &src, unsigned int axis, float epsilon); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp index 499263f11e..2f103a6f65 100644 --- a/tests/validation/reference/ReductionOperation.cpp +++ b/tests/validation/reference/ReductionOperation.cpp @@ -39,36 +39,39 @@ namespace reference namespace { template -struct square +T reduce_operation(T *ptr, int reduce_elements, ReductionOperation op, int stride) { - T operator()(const T &lhs, const T &rhs) const - { - return (lhs + rhs * rhs); - } -}; + using type = typename std::remove_cv::type; + auto res = type(0); -template -struct sum -{ - T operator()(const T &lhs, const T &rhs) const + if(std::is_integral::value) { - return (lhs + rhs); + uint32_t int_res = 0; + for(int i = 0; i < reduce_elements; ++i) + { + auto elem = static_cast(*(ptr + stride * i)); + int_res += (op == ReductionOperation::SUM_SQUARE) ? elem * elem : elem; + } + if(op == ReductionOperation::MEAN_SUM && reduce_elements > 0) + { + int_res /= reduce_elements; + } + res = saturate_cast(int_res); } -}; - -template -T reduce_operation(T *ptr, int reduce_elements, ReductionOperation op) -{ - switch(op) + else { - case ReductionOperation::SUM_SQUARE: - return std::accumulate(ptr, ptr + reduce_elements, static_cast(0), square()); - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - return std::accumulate(ptr, ptr + reduce_elements, static_cast(0), sum()); - default: - ARM_COMPUTE_ERROR("Unsupported reduction operation"); + for(int i = 0; i < reduce_elements; ++i) + { + auto elem = *(ptr + stride * i); + res += (op == ReductionOperation::SUM_SQUARE) ? 
elem * elem : elem; + } + if(op == ReductionOperation::MEAN_SUM && reduce_elements > 0) + { + res /= reduce_elements; + } } + + return res; } } // namespace @@ -77,44 +80,22 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap { // Create reference SimpleTensor dst{ dst_shape, src.data_type(), 1, src.quantization_info() }; - const unsigned int src_width = src.shape().x(); - const unsigned int src_height = src.shape().y(); - const unsigned int src_depth = src.shape().z(); - const unsigned int src_batch = src.shape()[3]; - const bool mean = op == ReductionOperation::MEAN_SUM; + const unsigned int src_width = src.shape().x(); + const unsigned int src_height = src.shape().y(); + const unsigned int src_depth = src.shape().z(); + const unsigned int src_batch = src.shape()[3]; + const int reduce_elems = src.shape()[axis]; switch(axis) { case 0: { - const int reduce_elems = src.shape()[axis]; - const unsigned int upper_dims = src.shape().total_size_upper(1); + const unsigned int upper_dims = src.shape().total_size_upper(1); for(unsigned int du = 0; du < upper_dims; ++du) { - if(std::is_integral::value) - { - uint32_t res = 0; - for(unsigned int x = 0; x < src_width; ++x) - { - res += static_cast(src[du * src_width + x]); - } - if(mean && src_width > 0) - { - res /= src_width; - } - dst[du] = saturate_cast(res); - } - else - { - const T *src_row_ptr = src.data() + du * reduce_elems; - - auto res = reduce_operation(src_row_ptr, reduce_elems, op); - if(mean && src_width > 0) - { - res /= src_width; - } - dst[du] = res; - } + const T *src_row_ptr = src.data() + du * reduce_elems; + auto res = reduce_operation(src_row_ptr, reduce_elems, op, 1); + dst[du] = res; } } break; @@ -125,32 +106,11 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap { for(unsigned int x = 0; x < src_width; ++x) { - if(std::is_integral::value) - { - uint32_t res = 0; - for(unsigned int y = 0; y < src_height; ++y) - { - res += static_cast(src[du * src_height * src_width + y * src_width + x]); - } - if(mean && src_height > 0) - { - res /= src_height; - } - dst[du * src_width + x] = saturate_cast(res); - } - else - { - auto res = T(0); - for(unsigned int y = 0; y < src_height; ++y) - { - res += src[du * src_height * src_width + y * src_width + x]; - } - if(mean && src_height > 0) - { - res /= src_height; - } - dst[du * src_width + x] = res; - } + const int in_offset = du * src_height * src_width + x; + const int out_offset = du * src_width + x; + const T *src_row_ptr = src.data() + in_offset; + auto res = reduce_operation(src_row_ptr, reduce_elems, op, src_width); + dst[out_offset] = res; } } } @@ -164,32 +124,11 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap { for(unsigned int y = 0; y < src_height; ++y) { - if(std::is_integral::value) - { - uint32_t res = T(0); - for(unsigned int z = 0; z < src_depth; ++z) - { - res += static_cast(src[du * src_depth * src_height * src_width + z * src_height * src_width + y * src_width + x]); - } - if(mean && src_depth > 0) - { - res /= src_depth; - } - dst[du * src_width * src_height + y * src_width + x] = saturate_cast(res); - } - else - { - auto res = T(0); - for(unsigned int z = 0; z < src_depth; ++z) - { - res += src[du * src_depth * src_height * src_width + z * src_height * src_width + y * src_width + x]; - } - if(mean && src_depth > 0) - { - res /= src_depth; - } - dst[du * src_width * src_height + y * src_width + x] = res; - } + const int in_offset = du * src_depth * src_height * src_width + y * src_width + 
x; + const int out_offset = du * src_width * src_height + y * src_width + x; + const T *src_row_ptr = src.data() + in_offset; + auto res = reduce_operation(src_row_ptr, reduce_elems, op, src_height * src_width); + dst[out_offset] = res; } } } @@ -206,34 +145,11 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap { for(unsigned int x = 0; x < src_width; ++x) { - if(std::is_integral::value) - { - uint32_t res = 0; - for(unsigned int w = 0; w < src_batch; ++w) - { - res += static_cast(src[du * src_batch * src_depth * src_height * src_width + w * src_width * src_height * src_depth + z * src_width * src_height + y * src_width + x]); - } - if(mean && src_batch > 0) - { - res /= src_batch; - } - - dst[du * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x] = saturate_cast(res); - } - else - { - auto res = T(0); - for(unsigned int w = 0; w < src_batch; ++w) - { - res += src[du * src_batch * src_depth * src_height * src_width + w * src_width * src_height * src_depth + z * src_width * src_height + y * src_width + x]; - } - if(mean && src_batch > 0) - { - res /= src_batch; - } - - dst[du * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x] = res; - } + const int in_offset = du * src_batch * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x; + const int out_offset = du * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x; + const T *src_row_ptr = src.data() + in_offset; + auto res = reduce_operation(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth); + dst[out_offset] = res; } } } -- cgit v1.2.1