From 9d3a831d4131f8a8b37f127f11d36848d33e8496 Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio <michele.digiorgio@arm.com>
Date: Tue, 20 Nov 2018 12:31:24 +0000
Subject: COMPMID-1648: CLNormalizationLayer IN_MAP_2D support for NHWC for
 FP32/FP16

Change-Id: I49f1d865f5e7562f1d80db849353a89ef77e6a9e
---
 .../core/CL/kernels/CLNormalizationLayerKernel.h   |  6 +-
 arm_compute/core/Types.h                           |  7 +-
 .../runtime/CL/functions/CLNormalizationLayer.h    |  7 +-
 src/core/CL/CLKernelLibrary.cpp                    |  3 +-
 src/core/CL/cl_kernels/normalization_layer.cl      | 88 +++++++++++++++++++++-
 src/core/CL/kernels/CLNormalizationLayerKernel.cpp | 32 +++++---
 tests/validation/CL/NormalizationLayer.cpp         | 27 ++++---
 .../validation/GLES_COMPUTE/NormalizationLayer.cpp | 10 ++-
 tests/validation/NEON/NormalizationLayer.cpp       | 18 +++--
 .../fixtures/NormalizationLayerFixture.h           | 19 +++--
 tests/validation/reference/NormalizationLayer.cpp  |  2 +-
 11 files changed, 173 insertions(+), 46 deletions(-)

diff --git a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
index beeb8b838e..498fc11665 100644
--- a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
@@ -48,16 +48,18 @@ public:
     /** Set the input and output tensors.
      *
      * @param[in]  input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                       and an optional 4th dimension for batch of inputs. Data types supported: F16/F32.
+     *                       and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
      * @param[out] output    Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
+     *                       Data layouts supported: same as @p input.
      * @param[in]  norm_info Normalization layer information like the normalization type, normalization size and other parameters.
      */
     void configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel
      *
      * @param[in] input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                      and an optional 4th dimension for batch of inputs. Data types supported: F16/F32.
+     *                      and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
      * @param[in] output    Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
+     *                      Data layouts supported: same as @p input.
      * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
      *
      * @return a status
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 0f2786cd12..9f3857c6cd 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1345,7 +1345,7 @@ class NormalizationLayerInfo
 public:
     /** Default Constructor
      *
-     * @param[in] type      The normalization type. Can be @ref NormType::IN_MAP_1D, @ref NormType::IN_MAP_2D or @ref NORM_TYPE::CROSS_MAP
+     * @param[in] type      The normalization type. Can be @ref NormType::IN_MAP_1D, @ref NormType::IN_MAP_2D or @ref NormType::CROSS_MAP
      * @param[in] norm_size The normalization size is the number of elements to normalize across. Defaults to 5.
      * @param[in] alpha     (Optional) Alpha parameter used by normalization equation. Defaults to 0.0001.
      * @param[in] beta      (Optional) Beta parameter used by normalization equation. Defaults to 0.5.
@@ -1382,6 +1382,11 @@ public:
     {
         return _kappa;
     }
+    /** Get the is_scaled value */
+    bool is_scaled() const
+    {
+        return _is_scaled;
+    }
     /** Check if normalization is cross map */
     bool is_cross_map() const
     {
diff --git a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
index 89e20d20f6..1ed87fde27 100644
--- a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
@@ -51,16 +51,19 @@ public:
     /** Set the input and output tensors.
      *
      * @param[in, out] input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                           and an optional 4th dimension for batch of inputs. Data types supported: F16/F32 (Written to by the border handler)
+     *                           and an optional 4th dimension for batch of inputs. Data types supported: F16/F32 (Written to by the border handler).
+     *                           Data layouts supported: NCHW/NHWC.
      * @param[out]     output    Destination tensor. Dimensions, data type and number of channels must match the input ones.
+     *                           Data types supported: same as @p input. Data layouts supported: same as @p input.
      * @param[in]      norm_info Normalization layer information like the normalization type, normalization size and other parameters.
      */
     void configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayer
      *
      * @param[in] input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                      and an optional 4th dimension for batch of inputs. Data types supported: F16/F32
+     *                      and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
      * @param[in] output    Destination tensor. Dimensions, data type and number of channels must match the input ones.
+     *                      Data layouts supported: same as @p input.
      * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
      *
      * @return a status
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 33e66705e3..3a002e808d 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -326,7 +326,8 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "non_linear_filter_disk5x5", "non_linear_filter5x5.cl" },
     { "non_max_suppression", "nonmax.cl" },
     { "normalization_layer_cross_map", "normalization_layer.cl" },
-    { "normalization_layer_in_map", "normalization_layer.cl" },
+    { "normalization_layer_in_map_nchw", "normalization_layer.cl" },
+    { "normalization_layer_in_map_nhwc", "normalization_layer.cl" },
     { "normalize_planar_yuv_layer_nchw", "normalize_planar_yuv_layer.cl" },
     { "normalize_planar_yuv_layer_nhwc", "normalize_planar_yuv_layer.cl" },
     { "normalize_planar_yuv_layer_q8_nchw", "normalize_planar_yuv_layer_quantized.cl" },
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
index 0b6df39c9a..390f8fcbeb 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -32,6 +32,7 @@
 #define LOAD_OP(offset, ptr) vload4(offset, ptr)
 #define STORE_OP(data, offset, ptr) vstore4(data, offset, ptr)
 
+#if defined(NUM_SLICES)
 /** Apply cross-map normalization.
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
@@ -91,9 +92,10 @@ __kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input),
 
     STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
 }
+#endif /* defined(NUM_SLICES) */
 
 #if defined(WIDTH_SIZE)
-/** Apply in-map normalization.
+/** Apply in-map normalization when tensors are in the NCHW data layout format.
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
@@ -117,8 +119,8 @@ __kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input),
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-__kernel void normalization_layer_in_map(TENSOR3D_DECLARATION(input),
-                                         TENSOR3D_DECLARATION(output))
+__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input),
+                                              TENSOR3D_DECLARATION(output))
 {
     Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
     Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
@@ -170,3 +172,83 @@ __kernel void normalization_layer_in_map(TENSOR3D_DECLARATION(input),
     STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
 }
 #endif // defined(WIDTH_SIZE)
+
+#if defined(NUM_SLICES)
+/** Apply in-map normalization when tensors are in the NHWC data layout format.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the first destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
+                                              TENSOR3D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
+
+    const int current_cols = get_global_id(1);
+    const int first_col    = max(-(int)RADIUS, -current_cols);
+    const int last_col     = min((int)RADIUS, (int)get_global_size(1) - 1 - current_cols);
+
+#if defined(IN_MAP_2D)
+    const int current_rows = get_global_id(2);
+    const int first_row    = max(-(int)RADIUS, -current_rows);
+    const int last_row     = min((int)RADIUS, (int)NUM_SLICES - 1 - current_rows);
+#endif /* defined(IN_MAP_2D) */
+
+#if defined(IN_MAP_2D)
+    for(int j = first_row; j <= last_row; ++j)
+    {
+#endif /* defined(IN_MAP_2D) */
+        for(int i = first_col; i <= last_col; ++i)
+        {
+#if defined(IN_MAP_2D)
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, i, j));
+#else  /* defined(IN_MAP_2D) */
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, i, 0));
+#endif /* defined(IN_MAP_2D) */
+            acc = ADD_OP(acc, MUL_OP(values, values));
+        }
+#if defined(IN_MAP_2D)
+    }
+#endif /* defined(IN_MAP_2D) */
+
+    acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized = POW_OP(acc, beta_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized_pixel = DIV_OP(LOAD_OP(0, (__global DATA_TYPE *)in.ptr), normalized);
+
+    STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(NUM_SLICES) */
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index 67357da7d1..9623ec6a89 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -37,20 +37,21 @@ using namespace arm_compute;
 
 namespace
 {
+constexpr unsigned int num_elems_processed_per_iteration = 4;
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW, DataLayout::NHWC);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC && norm_info.type() == NormType::IN_MAP_2D,
-                                    "Only Cross-map and 1D In-map normalization is supported for NHWC layout");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
 
     // Checks performed when output is configured
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
     }
 
@@ -62,8 +63,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output, *input->clone());
 
-    const unsigned int num_elems_processed_per_iteration = 4;
-
     const unsigned int norm_idx              = get_normalization_dimension_index(input->data_layout(), norm_info);
     const bool         is_norm_accross_width = norm_idx == 0;
 
@@ -118,15 +117,14 @@ void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *ou
     _input  = input;
     _output = output;
 
-    const unsigned int num_elems_processed_per_iteration = 4;
-    const bool         is_in_map_2D                      = (norm_info.type() == NormType::IN_MAP_2D);
-
     const DataLayout   data_layout  = input->info()->data_layout();
     const unsigned int norm_idx     = get_normalization_dimension_index(data_layout, norm_info);
     _is_norm_across_width           = norm_idx == 0;
     const unsigned int border_width = _is_norm_across_width ? num_elems_processed_per_iteration - 1 : 0;
     _border_size                    = BorderSize(0, border_width);
 
+    const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
+
     // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
@@ -140,8 +138,24 @@ void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *ou
     build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
 
     // Create kernel
-    std::string kernel_name = _is_norm_across_width ? "normalization_layer_in_map" : "normalization_layer_cross_map";
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+    std::string kernel_name;
+    if(norm_info.is_in_map())
+    {
+        kernel_name = "normalization_layer_in_map_" + lower_string(string_from_data_layout(data_layout));
+    }
+    else
+    {
+        if(data_layout == DataLayout::NCHW)
+        {
+            kernel_name = "normalization_layer_cross_map";
+        }
+        else
+        {
+            // 1D Cross-Map normalization in NHWC is the same as 1D In-Map normalization in NCHW
+            kernel_name = "normalization_layer_in_map_nchw";
+        }
+    }
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
diff --git a/tests/validation/CL/NormalizationLayer.cpp b/tests/validation/CL/NormalizationLayer.cpp
index 1087403b1c..fdfb225866 100644
--- a/tests/validation/CL/NormalizationLayer.cpp
+++ b/tests/validation/CL/NormalizationLayer.cpp
@@ -48,12 +48,13 @@ RelativeTolerance<half>  tolerance_f16(half(0.2));
 RelativeTolerance<float> tolerance_f32(0.05f);
 
 /** Input data set. */
-const auto NormalizationDatasetFP16 = combine(combine(combine(framework::dataset::make("NormType", { NormType::IN_MAP_1D, NormType::CROSS_MAP }), framework::dataset::make("NormalizationSize", 3, 9,
-                                                              2)),
+const auto NormalizationDatasetFP16 = combine(combine(combine(framework::dataset::make("NormType", { NormType::IN_MAP_1D, NormType::IN_MAP_2D, NormType::CROSS_MAP }),
+                                                              framework::dataset::make("NormalizationSize", 3, 9, 2)),
                                                       framework::dataset::make("Beta", { 0.5f, 1.f, 2.f })),
                                               framework::dataset::make("IsScaled", { true }));
 
-const auto NormalizationDatasetFP32 = combine(combine(combine(datasets::NormalizationTypes(), framework::dataset::make("NormalizationSize", 3, 9, 2)),
+const auto NormalizationDatasetFP32 = combine(combine(combine(framework::dataset::make("NormType", { NormType::IN_MAP_1D, NormType::IN_MAP_2D, NormType::CROSS_MAP }),
+                                                              framework::dataset::make("NormalizationSize", 3, 9, 2)),
                                                       framework::dataset::make("Beta", { 0.5f, 1.f, 2.f })),
                                               framework::dataset::make("IsScaled", { true, false }));
 } // namespace
@@ -100,14 +101,16 @@ using CLNormalizationLayerFixture = NormalizationValidationFixture<CLTensor, CLA
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), NormalizationDatasetFP16),
-                                                                                                               framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), NormalizationDatasetFP16),
+                                                                                                                       framework::dataset::make("DataType", DataType::F16)),
+                                                                                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLNormalizationLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), NormalizationDatasetFP16),
-                                                                                                             framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLNormalizationLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), NormalizationDatasetFP16),
+                                                                                                                     framework::dataset::make("DataType", DataType::F16)),
+                                                                                                             framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
@@ -115,14 +118,16 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLNormalizationLayerFixture<half>, framework::D
 TEST_SUITE_END() // FP16
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), NormalizationDatasetFP32),
-                                                                                                                framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), NormalizationDatasetFP32),
+                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLNormalizationLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), NormalizationDatasetFP32),
-                                                                                                              framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLNormalizationLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), NormalizationDatasetFP32),
+                                                                                                                      framework::dataset::make("DataType", DataType::F32)),
+                                                                                                              framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
diff --git a/tests/validation/GLES_COMPUTE/NormalizationLayer.cpp b/tests/validation/GLES_COMPUTE/NormalizationLayer.cpp
index 4bd931e420..67dca32ed8 100644
--- a/tests/validation/GLES_COMPUTE/NormalizationLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/NormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,12 +63,16 @@ using GCNormalizationLayerFixture = NormalizationValidationFixture<GCTensor, GCA
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, GCNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(NormalizationDataset, framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, GCNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(NormalizationDataset,
+                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, GCNormalizationLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(NormalizationDataset, framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunLarge, GCNormalizationLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(NormalizationDataset,
+                                                                                                                      framework::dataset::make("DataType", DataType::F32)),
+                                                                                                              framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f32);
diff --git a/tests/validation/NEON/NormalizationLayer.cpp b/tests/validation/NEON/NormalizationLayer.cpp
index d8461519d4..f9b32b9259 100644
--- a/tests/validation/NEON/NormalizationLayer.cpp
+++ b/tests/validation/NEON/NormalizationLayer.cpp
@@ -102,12 +102,16 @@ using NENormalizationLayerFixture = NormalizationValidationFixture<Tensor, Acces
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NENormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(NormalizationDataset, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NENormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(NormalizationDataset,
+                                                                                                                       framework::dataset::make("DataType", DataType::F16)),
+                                                                                                               framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NENormalizationLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(NormalizationDataset, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NENormalizationLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(NormalizationDataset,
+                                                                                                                     framework::dataset::make("DataType", DataType::F16)),
+                                                                                                             framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -116,14 +120,16 @@ TEST_SUITE_END() // FP16
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NENormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), NormalizationDatasetFP32),
-                                                                                                                framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NENormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), NormalizationDatasetFP32),
+                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NENormalizationLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), NormalizationDatasetFP32),
-                                                                                                              framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NENormalizationLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), NormalizationDatasetFP32),
+                                                                                                                      framework::dataset::make("DataType", DataType::F32)),
+                                                                                                              framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
diff --git a/tests/validation/fixtures/NormalizationLayerFixture.h b/tests/validation/fixtures/NormalizationLayerFixture.h
index 318b77e1a7..4d6ef7019f 100644
--- a/tests/validation/fixtures/NormalizationLayerFixture.h
+++ b/tests/validation/fixtures/NormalizationLayerFixture.h
@@ -47,11 +47,11 @@ class NormalizationValidationGenericFixture : public framework::Fixture
 {
 public:
     template <typename...>
-    void setup(TensorShape shape, NormType norm_type, int norm_size, float beta, bool is_scaled, DataType data_type)
+    void setup(TensorShape shape, NormType norm_type, int norm_size, float beta, bool is_scaled, DataType data_type, DataLayout data_layout)
     {
         NormalizationLayerInfo info(norm_type, norm_size, 5, beta, 1.f, is_scaled);
 
-        _target    = compute_target(shape, info, data_type);
+        _target    = compute_target(shape, info, data_type, data_layout);
         _reference = compute_reference(shape, info, data_type);
     }
 
@@ -63,11 +63,16 @@ protected:
         library->fill(tensor, distribution, 0);
     }
 
-    TensorType compute_target(const TensorShape &shape, NormalizationLayerInfo info, DataType data_type)
+    TensorType compute_target(TensorShape shape, NormalizationLayerInfo info, DataType data_type, DataLayout data_layout)
     {
+        if(data_layout == DataLayout::NHWC)
+        {
+            permute(shape, PermutationVector(2U, 0U, 1U));
+        }
+
         // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type, 1);
-        TensorType dst = create_tensor<TensorType>(shape, data_type, 1);
+        TensorType src = create_tensor<TensorType>(shape, data_type, 1, QuantizationInfo(), data_layout);
+        TensorType dst = create_tensor<TensorType>(shape, data_type, 1, QuantizationInfo(), data_layout);
 
         // Create and configure function
         FunctionType norm_layer;
@@ -112,9 +117,9 @@ class NormalizationValidationFixture : public NormalizationValidationGenericFixt
 {
 public:
     template <typename...>
-    void setup(TensorShape shape, NormType norm_type, int norm_size, float beta, bool is_scaled, DataType data_type)
+    void setup(TensorShape shape, NormType norm_type, int norm_size, float beta, bool is_scaled, DataType data_type, DataLayout data_layout)
     {
-        NormalizationValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, norm_type, norm_size, beta, is_scaled, data_type);
+        NormalizationValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, norm_type, norm_size, beta, is_scaled, data_type, data_layout);
     }
 };
 } // namespace validation
diff --git a/tests/validation/reference/NormalizationLayer.cpp b/tests/validation/reference/NormalizationLayer.cpp
index e6ca233e75..d57e6f15a9 100644
--- a/tests/validation/reference/NormalizationLayer.cpp
+++ b/tests/validation/reference/NormalizationLayer.cpp
@@ -56,7 +56,7 @@ SimpleTensor<T> normalization_layer(const SimpleTensor<T> &src, NormalizationLay
     // IN_MAP_1D and CROSS_MAP normalize over a single axis only
     int radius_rows = (NormType::IN_MAP_2D == type) ? norm_size / 2 : 0;
 
-    if(type == NormType::CROSS_MAP)
+    if(info.is_cross_map())
     {
         // Remove also depth from upper dimensions since it is the dimension we
         // want to use for normalization
-- 
cgit v1.2.1