COMPMID-2943: Instance_Normalization VTS test on relaxed mode on GpuAcc

CLInstanceNormalizationLayer assumes there is no padding both on the input and output tensors. Although it is true that the operator itself does not add any padding, other layers might do it, therefore breaking the assumption. Change-Id: Ief7b74017ca0303bfb0670690f6d9ecbd4b9c29c Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-on: https://review.mlplatform.org/c/2336 Reviewed-by: Manuel Bottini <manuel.bottini@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
author: Michele Di Giorgio <michele.digiorgio@arm.com> 2019-11-20 15:06:52 +0000
committer: SiCong Li <sicong.li@arm.com> 2019-11-22 13:52:30 +0000
commit: f3551aee4ca3573280bfb3d65ad1ca864f9672d1 (patch)
tree: a9f25fe7a2cbaa3a93bac1ab9a48c423c30bf280
parent: 6ea2a42bfe2bcc8745d2fcc567f9b9bd7c593834 (diff)
download: ComputeLibrary-f3551aee4ca3573280bfb3d65ad1ca864f9672d1.tar.gz
1 files changed, 52 insertions, 41 deletions
diff --git a/src/core/CL/cl_kernels/instance_normalization.cl b/src/core/CL/cl_kernels/instance_normalization.cl
index 18afcc53a7..de7d57c69e 100644
--- a/src/core/CL/cl_kernels/instance_normalization.cl
+++ b/src/core/CL/cl_kernels/instance_normalization.cl
@@ -58,44 +58,58 @@ __kernel void instance_normalization(
 #endif /* IN_PLACE */
 )
 {
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+#ifndef IN_PLACE
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+#endif /* IN_PLACE */
+
     float sum    = 0.f;
     float sum_sq = 0.f;
 
 #if defined(NHWC)
 
-    const int pc             = get_global_id(0);
-    const int pn             = get_global_id(2);
+    const int ch             = get_global_id(0); // Current channel
+    const int batch          = get_global_id(2); // Current batch
     const int elements_plane = DIM_Y * DIM_Z;
-    const int elements_x_y   = DIM_X * DIM_Y;
-    const int elements_x_y_z = DIM_X * DIM_Y * DIM_Z;
 
     for(int i_w = 0; i_w < DIM_Y; ++i_w)
     {
         for(int i_h = 0; i_h < DIM_Z; ++i_h)
         {
-            float data = (float)*((__global DATA_TYPE *)input_ptr + pc + i_w * DIM_X + i_h * elements_x_y + pn * elements_x_y_z);
+            float data = (float) * ((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
             sum += data;
             sum_sq += data * data;
         }
     }
 
 #else // !defined(NHWC)
+    const int ch             = get_global_id(2) % DIM_Z; // Current channel
+    const int batch          = get_global_id(2) / DIM_Z; // Current batch
     const int elements_plane = DIM_X * DIM_Y;
-    const int plane_address  = get_global_id(2) * elements_plane;
-    int       i              = 0;
 
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     part_sum = 0.f;
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     part_sum_sq = 0.f;
     // Calculate partial sum
-    for(; i <= (elements_plane - VEC_SIZE); i += VEC_SIZE)
+    for(int y = 0; y < DIM_Y; ++y)
     {
-        // Load data
-        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-        data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_ptr + i + plane_address);
-        part_sum += data;
-        part_sum_sq += data * data;
+        int x = 0;
+        for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+        {
+            // Load data
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
+            part_sum += data;
+            part_sum_sq += data * data;
+        }
+        // Left-overs loop
+        for(; x < DIM_X; ++x)
+        {
+            DATA_TYPE data = *((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
+            part_sum.s0 += data;
+            part_sum_sq.s0 += data * data;
+        }
     }
     // Perform reduction
 #if VEC_SIZE > 8
@@ -112,20 +126,13 @@ __kernel void instance_normalization(
 #endif // VEC_SIZE > 2
     part_sum.s0 += part_sum.s1;
     part_sum_sq.s0 += part_sum_sq.s1;
-    // Left-overs loop
-    for(; i < elements_plane; ++i)
-    {
-        DATA_TYPE data = *((__global DATA_TYPE *)input_ptr + i + plane_address);
-        part_sum.s0 += data;
-        part_sum_sq.s0 += data * data;
-    }
 
     sum    = (float)part_sum.s0;
     sum_sq = (float)part_sum_sq.s0;
 
 #endif // defined(NHWC)
 
-    const float mean_float   = (sum / elements_plane);
+    const float     mean_float   = (sum / elements_plane);
     const DATA_TYPE mean         = (DATA_TYPE)mean_float;
     const float     var_float    = (sum_sq / elements_plane) - (mean_float * mean_float);
     const float     multip_float = GAMMA / sqrt(var_float + EPSILON);
@@ -137,44 +144,48 @@ __kernel void instance_normalization(
     {
         for(int i_h = 0; i_h < DIM_Z; ++i_h)
         {
-            __global DATA_TYPE *input_address = (__global DATA_TYPE *)input_ptr + pc + i_w * DIM_X + i_h * elements_x_y + pn * elements_x_y_z;
+            __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
 #ifdef IN_PLACE
             __global DATA_TYPE *output_address = input_address;
 #else  /* !IN_PLACE */
-            __global DATA_TYPE *output_address = (__global DATA_TYPE *)output_ptr + pc + i_w * DIM_X + i_h * elements_x_y + pn * elements_x_y_z;
+            __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
 #endif /* IN_PLACE */
             *(output_address) = (*(input_address) - mean) * multip + (DATA_TYPE)BETA;
         }
     }
 
 #else // !defined(NHWC)
-    i      = 0;
-    for(; i <= (elements_plane - VEC_SIZE); i += VEC_SIZE)
+    for(int y = 0; y < DIM_Y; ++y)
     {
-        __global DATA_TYPE *input_address  = (__global DATA_TYPE *)input_ptr + i + plane_address;
+        int x = 0;
+        for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+        {
+            __global DATA_TYPE *input_address  = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
 #ifdef IN_PLACE
-        __global DATA_TYPE *output_address = input_address;
+            __global DATA_TYPE *output_address = input_address;
 #else  /* !IN_PLACE */
-        __global DATA_TYPE *output_address = (__global DATA_TYPE *)output_ptr + i + plane_address;
+            __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
 #endif /* IN_PLACE */
 
-        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-        data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_address);
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            data = VLOAD(VEC_SIZE)(0, input_address);
 
-        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-        res = (data - mean) * multip + (DATA_TYPE)BETA;
-        VSTORE(VEC_SIZE)
-        (res, 0, (__global DATA_TYPE *)output_address);
-    }
-    for(; i < elements_plane; ++i)
-    {
-        __global DATA_TYPE *input_address  = (__global DATA_TYPE *)input_ptr + i + plane_address;
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            res = (data - mean) * multip + (DATA_TYPE)BETA;
+            VSTORE(VEC_SIZE)
+            (res, 0, output_address);
+        }
+        // Left-overs loop
+        for(; x < DIM_X; ++x)
+        {
+            __global DATA_TYPE *input_address  = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
 #ifdef IN_PLACE
-        __global DATA_TYPE *output_address = input_address;
+            __global DATA_TYPE *output_address = input_address;
 #else  /* !IN_PLACE */
-        __global DATA_TYPE *output_address = (__global DATA_TYPE *)output_ptr + i + plane_address;
+            __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
 #endif /* IN_PLACE */
-        *(output_address)                  = (*(input_address) - mean) * multip + (DATA_TYPE)BETA;
+            *(output_address)                  = (*(input_address) - mean) * multip + (DATA_TYPE)BETA;
+        }
     }
 #endif // defined(NHWC)
 }
author	Michele Di Giorgio <michele.digiorgio@arm.com>	2019-11-20 15:06:52 +0000
committer	SiCong Li <sicong.li@arm.com>	2019-11-22 13:52:30 +0000
commit	f3551aee4ca3573280bfb3d65ad1ca864f9672d1 (patch)
tree	a9f25fe7a2cbaa3a93bac1ab9a48c423c30bf280
parent	6ea2a42bfe2bcc8745d2fcc567f9b9bd7c593834 (diff)
download	ComputeLibrary-f3551aee4ca3573280bfb3d65ad1ca864f9672d1.tar.gz