From 5538d3436d6415d428b37d0f34ec81ca48cdff1a Mon Sep 17 00:00:00 2001
From: Michalis Spyrou
Date: Wed, 14 Nov 2018 08:10:13 +0000
Subject: COMPMID-1781 Add channel support in CLL2Normalization

Change-Id: Ibab049f09413258c99335b7da6b151530a1bd136
---
 src/core/CL/CLKernelLibrary.cpp                  |  5 +-
 src/core/CL/cl_kernels/l2_normalize.cl           | 59 ++++++++++++++++++++++--
 src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp | 45 +++++++++++++++---
 3 files changed, 97 insertions(+), 12 deletions(-)

(limited to 'src/core/CL')

diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 847236925a..3c2528f358 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -303,8 +303,9 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "IYUV_to_RGB888_bt709", "color_convert.cl" },
     { "IYUV_to_RGBA8888_bt709", "color_convert.cl" },
     { "IYUV_to_YUV444_bt709", "color_convert.cl" },
-    { "l2_normalize_nchw", "l2_normalize.cl" },
-    { "l2_normalize_nhwc", "l2_normalize.cl" },
+    { "l2_normalize_x", "l2_normalize.cl" },
+    { "l2_normalize_y", "l2_normalize.cl" },
+    { "l2_normalize_z", "l2_normalize.cl" },
     { "lktracker_stage0", "optical_flow_pyramid_lk.cl" },
     { "lktracker_stage1", "optical_flow_pyramid_lk.cl" },
     { "magnitude_phase", "magnitude_phase.cl" },
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/l2_normalize.cl
index d230487030..5f66efbcc4 100644
--- a/src/core/CL/cl_kernels/l2_normalize.cl
+++ b/src/core/CL/cl_kernels/l2_normalize.cl
@@ -23,7 +23,7 @@
  */
 #include "helpers.h"

-/** This kernel performs l2 normalization. (NCHW)
+/** This kernel performs l2 normalization on x-axis
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
@@ -42,7 +42,7 @@
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in] epsilon                           Epsilon value
  */
-__kernel void l2_normalize_nchw(
+__kernel void l2_normalize_x(
     VECTOR_DECLARATION(src),
     VECTOR_DECLARATION(sum),
     VECTOR_DECLARATION(dst),
@@ -60,7 +60,7 @@ __kernel void l2_normalize_nchw(
     vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
 }

-/** This kernel performs l2 normalization. (NHWC)
+/** This kernel performs l2 normalization on y-axis.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
@@ -85,7 +85,7 @@ __kernel void l2_normalize_nchw(
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in] epsilon                           Epsilon value
  */
-__kernel void l2_normalize_nhwc(
+__kernel void l2_normalize_y(
     IMAGE_DECLARATION(src),
     IMAGE_DECLARATION(sum),
     IMAGE_DECLARATION(dst),
@@ -103,5 +103,56 @@ __kernel void l2_normalize_nhwc(
     VEC_DATA_TYPE(DATA_TYPE, 16)
     normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(sums, epsilon));

+    vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
+}
+/** This kernel performs l2 normalization on z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  sum_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  epsilon                           Epsilon value
+ */
+__kernel void l2_normalize_z(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(sum),
+    TENSOR3D_DECLARATION(dst),
+    DATA_TYPE epsilon)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D sum = CONVERT_TO_TENSOR3D_STRUCT(sum);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = vload16(0, (__global DATA_TYPE *)src.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    sums = vload16(0, (__global DATA_TYPE *)sum.ptr);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(sums, epsilon));
+
+    vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
+}
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index cfd04ef392..97dd919d08 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -109,11 +109,28 @@ void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor
     build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));

     // Create kernel
-    const DataLayout data_layout = input->info()->data_layout();
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("l2_normalize_" + lower_string(string_from_data_layout(data_layout)), build_opts));
+    std::string  kernel_name;
+    unsigned int idx = 0;
+    switch(axis)
+    {
+        case 0:
+            kernel_name = "x";
+            idx         = num_arguments_per_1D_tensor() * 3;
+            break;
+        case 1:
+            kernel_name = "y";
+            idx         = num_arguments_per_2D_tensor() * 3;
+            break;
+        case 2:
+            kernel_name = "z";
+            idx         = num_arguments_per_3D_tensor() * 3;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+    }
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("l2_normalize_" + kernel_name, build_opts));

     // Set epsilon argument
-    unsigned int idx = data_layout == DataLayout::NCHW ? num_arguments_per_1D_tensor() * 3 : num_arguments_per_2D_tensor() * 3;
     if(input->info()->data_type() == DataType::F32)
     {
         _kernel.setArg(idx, _epsilon);
@@ -145,9 +162,9 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue

     Window window_sum(window);

-    switch(_input->info()->data_layout())
+    switch(_axis)
     {
-        case DataLayout::NCHW:
+        case 0:
         {
             window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
             Window in_slice  = window.first_slice_window_1D();
@@ -163,7 +180,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
             while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
         }
         break;
-        case DataLayout::NHWC:
+        case 1:
         {
             window_sum.set(Window::DimY, Window::Dimension(0, 0, 0));
             Window in_slice  = window.first_slice_window_2D();
@@ -179,6 +196,22 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
             while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
         }
         break;
+        case 2:
+        {
+            window_sum.set(Window::DimZ, Window::Dimension(0, 0, 0));
+            Window in_slice  = window.first_slice_window_3D();
+            Window sum_slice = window_sum.first_slice_window_3D();
+            do
+            {
+                unsigned int idx = 0;
+                add_3D_tensor_argument(idx, _input, in_slice);
+                add_3D_tensor_argument(idx, _sum, sum_slice);
+                add_3D_tensor_argument(idx, _output, in_slice);
+                enqueue(queue, *this, in_slice);
+            }
+            while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
+        }
+        break;
         default:
             ARM_COMPUTE_ERROR("Not supported");
     }

-- 
cgit v1.2.1
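
For anyone wanting to exercise the new axis-based kernel selection from the host side, here is a minimal usage sketch. It assumes the public runtime wrapper CLL2NormalizeLayer and its configure(input, output, axis, epsilon) overload from the same library version; those live outside this diff, so the exact names and signature are an assumption rather than part of the patch. With a 3D tensor, axis 2 is the dimension served by the new l2_normalize_z kernel, which for an NCHW tensor is the channel dimension referred to in the commit subject.

// Hypothetical usage sketch; CLL2NormalizeLayer and configure(input, output, axis, epsilon)
// are assumed from the library's public runtime API and are not modified by this patch.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h"

using namespace arm_compute;

int main()
{
    // Set up the default CL context and command queue used by all CL functions.
    CLScheduler::get().default_init();

    // A 3D shape so that axis 2 (handled by the new l2_normalize_z kernel) exists.
    const TensorShape shape(16U, 4U, 8U);

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    dst.allocator()->init(TensorInfo(shape, 1, DataType::F32));

    // axis = 0 -> l2_normalize_x, axis = 1 -> l2_normalize_y, axis = 2 -> l2_normalize_z.
    CLL2NormalizeLayer l2_norm;
    l2_norm.configure(&src, &dst, 2, 1e-12f);

    src.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src through src.map() / src.unmap() ...

    l2_norm.run();
    CLScheduler::get().sync();
    return 0;
}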