1 files changed, 52 insertions, 41 deletions
diff --git a/src/core/CL/cl_kernels/instance_normalization.cl b/src/core/CL/cl_kernels/instance_normalization.cl
index 18afcc53a7..de7d57c69e 100644
--- a/src/core/CL/cl_kernels/instance_normalization.cl
+++ b/src/core/CL/cl_kernels/instance_normalization.cl
@@ -58,44 +58,58 @@ __kernel void instance_normalization(
 #endif /* IN_PLACE */
 )
 {
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+#ifndef IN_PLACE
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+#endif /* IN_PLACE */
+
     float sum    = 0.f;
     float sum_sq = 0.f;
 
 #if defined(NHWC)
 
-    const int pc             = get_global_id(0);
-    const int pn             = get_global_id(2);
+    const int ch             = get_global_id(0); // Current channel
+    const int batch          = get_global_id(2); // Current batch
     const int elements_plane = DIM_Y * DIM_Z;
-    const int elements_x_y   = DIM_X * DIM_Y;
-    const int elements_x_y_z = DIM_X * DIM_Y * DIM_Z;
 
     for(int i_w = 0; i_w < DIM_Y; ++i_w)
     {
         for(int i_h = 0; i_h < DIM_Z; ++i_h)
         {
-            float data = (float)*((__global DATA_TYPE *)input_ptr + pc + i_w * DIM_X + i_h * elements_x_y + pn * elements_x_y_z);
+            float data = (float) * ((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
             sum += data;
             sum_sq += data * data;
         }
     }
 
 #else // !defined(NHWC)
+    const int ch             = get_global_id(2) % DIM_Z; // Current channel
+    const int batch          = get_global_id(2) / DIM_Z; // Current batch
     const int elements_plane = DIM_X * DIM_Y;
-    const int plane_address  = get_global_id(2) * elements_plane;
-    int       i              = 0;
 
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     part_sum = 0.f;
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     part_sum_sq = 0.f;
     // Calculate partial sum
-    for(; i <= (elements_plane - VEC_SIZE); i += VEC_SIZE)
+    for(int y = 0; y < DIM_Y; ++y)
     {
-        // Load data
-        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-        data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_ptr + i + plane_address);
-        part_sum += data;
-        part_sum_sq += data * data;
+        int x = 0;
+        for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+        {
+            // Load data
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
+            part_sum += data;
+            part_sum_sq += data * data;
+        }
+        // Left-overs loop
+        for(; x < DIM_X; ++x)
+        {
+            DATA_TYPE data = *((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
+            part_sum.s0 += data;
+            part_sum_sq.s0 += data * data;
+        }
     }
     // Perform reduction
 #if VEC_SIZE > 8
@@ -112,20 +126,13 @@ __kernel void instance_normalization(
 #endif // VEC_SIZE > 2
     part_sum.s0 += part_sum.s1;
     part_sum_sq.s0 += part_sum_sq.s1;
-    // Left-overs loop
-    for(; i < elements_plane; ++i)
-    {
-        DATA_TYPE data = *((__global DATA_TYPE *)input_ptr + i + plane_address);
-        part_sum.s0 += data;
-        part_sum_sq.s0 += data * data;
-    }
 
     sum    = (float)part_sum.s0;
     sum_sq = (float)part_sum_sq.s0;
 
 #endif // defined(NHWC)
 
-    const float mean_float   = (sum / elements_plane);
+    const float     mean_float   = (sum / elements_plane);
     const DATA_TYPE mean         = (DATA_TYPE)mean_float;
     const float     var_float    = (sum_sq / elements_plane) - (mean_float * mean_float);
     const float     multip_float = GAMMA / sqrt(var_float + EPSILON);
@@ -137,44 +144,48 @@ __kernel void instance_normalization(
     {
         for(int i_h = 0; i_h < DIM_Z; ++i_h)
         {
-            __global DATA_TYPE *input_address = (__global DATA_TYPE *)input_ptr + pc + i_w * DIM_X + i_h * elements_x_y + pn * elements_x_y_z;
+            __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
 #ifdef IN_PLACE
             __global DATA_TYPE *output_address = input_address;
 #else  /* !IN_PLACE */
-            __global DATA_TYPE *output_address = (__global DATA_TYPE *)output_ptr + pc + i_w * DIM_X + i_h * elements_x_y + pn * elements_x_y_z;
+            __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
 #endif /* IN_PLACE */
             *(output_address) = (*(input_address) - mean) * multip + (DATA_TYPE)BETA;
         }
     }
 
 #else // !defined(NHWC)
-    i      = 0;
-    for(; i <= (elements_plane - VEC_SIZE); i += VEC_SIZE)
+    for(int y = 0; y < DIM_Y; ++y)
     {
-        __global DATA_TYPE *input_address  = (__global DATA_TYPE *)input_ptr + i + plane_address;
+        int x = 0;
+        for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+        {
+            __global DATA_TYPE *input_address  = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
 #ifdef IN_PLACE
-        __global DATA_TYPE *output_address = input_address;
+            __global DATA_TYPE *output_address = input_address;
 #else  /* !IN_PLACE */
-        __global DATA_TYPE *output_address = (__global DATA_TYPE *)output_ptr + i + plane_address;
+            __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
 #endif /* IN_PLACE */
 
-        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-        data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_address);
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            data = VLOAD(VEC_SIZE)(0, input_address);
 
-        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-        res = (data - mean) * multip + (DATA_TYPE)BETA;
-        VSTORE(VEC_SIZE)
-        (res, 0, (__global DATA_TYPE *)output_address);
-    }
-    for(; i < elements_plane; ++i)
-    {
-        __global DATA_TYPE *input_address  = (__global DATA_TYPE *)input_ptr + i + plane_address;
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            res = (data - mean) * multip + (DATA_TYPE)BETA;
+            VSTORE(VEC_SIZE)
+            (res, 0, output_address);
+        }
+        // Left-overs loop
+        for(; x < DIM_X; ++x)
+        {
+            __global DATA_TYPE *input_address  = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
 #ifdef IN_PLACE
-        __global DATA_TYPE *output_address = input_address;
+            __global DATA_TYPE *output_address = input_address;
 #else  /* !IN_PLACE */
-        __global DATA_TYPE *output_address = (__global DATA_TYPE *)output_ptr + i + plane_address;
+            __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
 #endif /* IN_PLACE */
-        *(output_address)                  = (*(input_address) - mean) * multip + (DATA_TYPE)BETA;
+            *(output_address)                  = (*(input_address) - mean) * multip + (DATA_TYPE)BETA;
+        }
     }
 #endif // defined(NHWC)
 }