From c93691717a6e7ca67e32b4dedd233b8c63b6daf2 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Wed, 26 Sep 2018 11:25:40 +0100
Subject: COMPMID-1523: Fuse BN node with convolution.

Change-Id: I146936c9e98b343496a4b61cdbadf0eaa38e885a
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/154008
Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com>
Tested-by: bsgcomp <bsgcomp@arm.com>
---
 src/core/CL/cl_kernels/batchnormalization_layer.cl | 162 ++++++++++++++++++++-
 1 file changed, 160 insertions(+), 2 deletions(-)

(limited to 'src/core/CL/cl_kernels/batchnormalization_layer.cl')

diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 5352af3c5a..df141269bc 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -23,14 +23,14 @@
  */
 #include "helpers.h"
 
-#if defined(VEC_SIZE) && defined(DATA_TYPE)
-
 #define ADD_OP(a, b) ((a) + (b))
 #define SUB_OP(a, b) ((a) - (b))
 #define MUL_OP(a, b) ((a) * (b))
 #define INVSQRT_OP(a) rsqrt((a))
 #define SQCVT_SAT(a) (a)
 
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
+
 #if defined(FUSED_ACTIVATION)
 #include "activation_layer.cl"
 #define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
@@ -258,3 +258,161 @@ __kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
     (res, 0, (__global DATA_TYPE *)out.ptr);
 }
 #endif /* defined(VEC_SIZE) && defined(DATA_TYPE) */
+
+#if defined(NUM_CHANNELS) && defined(DATA_TYPE) && defined(EPSILON)
+/** Fuse batchnorm parameters to convolution layer parameters
+ *
+ * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DNUM_CHANNELS=size. e.g. -DNUM_CHANNELS=16
+ * @attention Batch normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
+ *
+ * @param[in]  conv_w_ptr                             Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  conv_w_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  conv_w_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  conv_w_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  conv_w_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  conv_w_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  conv_w_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  conv_w__stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  conv_w__step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  conv_w_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  bn_mean_ptr                            Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  bn_mean_stride_x                       Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  bn_mean_step_x                         bn_mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bn_mean_offset_first_element_in_bytes  The offset of the first element in the mean source tensor
+ * @param[in]  bn_var_ptr                             Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in]  bn_var_stride_x                        Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  bn_var_step_x                          bn_var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bn_var_offset_first_element_in_bytes   The offset of the first element in the var source tensor
+ * @param[out] fused_w_ptr                            Pointer to the destination weights tensors. Supported data types: same as @p input_ptr
+ * @param[in]  fused_w_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  fused_w_step_x                         fused_w_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  fused_w_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  fused_w_step_y                         fused_w_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  fused_w_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  fused_w_step_z                         fused_w_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  fused_w_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  fused_w_step_w                         fused_w_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  fused_w_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  fused_b_ptr                            Pointer to the destination bias tensor. Supported data types: same as @p input_ptr
+ * @param[in]  fused_b_stride_x                       Stride of the bias source tensor in X dimension (in bytes)
+ * @param[in]  fused_b_step_x                         fused_b_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  fused_b_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  conv_b_ptr                             Pointer to the source bias tensor. Supported data types: same as @p input_ptr
+ * @param[in]  conv_b_stride_x                        Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  conv_b_step_x                          conv_b_beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  conv_b_offset_first_element_in_bytes   The offset of the first element in the source bias tensor
+ * @param[in]  bn_beta_ptr                            Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  bn_beta_stride_x                       Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  bn_beta_step_x                         bn_beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bn_beta_offset_first_element_in_bytes  The offset of the first element in the beta source tensor
+ * @param[in]  bn_gamma_ptr                           Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  bn_gamma_stride_x                      Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  bn_gamma_step_x                        bn_gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bn_gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in]  epsilon                                Epsilon parameter in the batch normalization equation
+ */
+__kernel void fuse_batchnormalization_layer(TENSOR4D_DECLARATION(conv_w),
+                                            VECTOR_DECLARATION(bn_mean),
+                                            VECTOR_DECLARATION(bn_var)
+#ifndef IN_PLACE_W
+                                            ,
+                                            TENSOR4D_DECLARATION(fused_w)
+#endif /* not IN_PLACE_W */
+#ifndef IN_PLACE_B
+                                            ,
+                                            VECTOR_DECLARATION(fused_b)
+#endif /* not IN_PLACE_B */
+#ifdef HAS_BIAS
+                                            ,
+                                            VECTOR_DECLARATION(conv_b)
+#endif /* HAS_BIAS */
+#ifndef USE_DEFAULT_BETA
+                                            ,
+                                            VECTOR_DECLARATION(bn_beta)
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+                                            ,
+                                            VECTOR_DECLARATION(bn_gamma)
+#endif /* USE_DEFAULT_GAMMA */
+                                           )
+{
+    Tensor4D conv_w  = CONVERT_TO_TENSOR4D_STRUCT(conv_w, NUM_CHANNELS);
+    Vector   bn_mean = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_mean);
+    Vector   bn_var  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_var);
+
+    // In-place ops
+#ifdef IN_PLACE_W
+    Tensor4D fused_w = conv_w;
+#else  /* IN_PLACE_W */
+    Tensor4D  fused_w                      = CONVERT_TO_TENSOR4D_STRUCT(fused_w, NUM_CHANNELS);
+#endif /* IN_PLACE */
+#ifdef IN_PLACE_B
+    Vector fused_b = conv_b;
+#else  /* IN_PLACE_W */
+    Vector    fused_b                      = CONVERT_TO_VECTOR_STRUCT_NO_STEP(fused_b);
+#endif /* IN_PLACE */
+
+    // Conditional ops
+#ifdef HAS_BIAS
+    Vector conv_b = CONVERT_TO_VECTOR_STRUCT_NO_STEP(conv_b);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_BETA
+    Vector bn_beta = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_beta);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+    Vector bn_gamma = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_gamma);
+#endif /* USE_DEFAULT_GAMMA */
+
+    const int current_slice = get_global_id(2) / NUM_CHANNELS;
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    conv_w.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * conv_w_stride_x;
+    fused_w.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * fused_w_stride_x;
+
+    // Load W
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    wn = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)conv_w.ptr);
+#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    DATA_TYPE wn                           = *((__global DATA_TYPE *)(conv_w.ptr));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+
+    // rvar = 1 / sqrt(var + epsilon)
+    const DATA_TYPE var  = *((__global DATA_TYPE *)(bn_var.ptr + current_slice * bn_var.stride_x));
+    const DATA_TYPE rvar = INVSQRT_OP(ADD_OP(var, SQCVT_SAT((float)EPSILON)));
+    wn *= rvar;
+
+    // Load b
+    const DATA_TYPE mean = *((__global DATA_TYPE *)(bn_mean.ptr + current_slice * bn_mean.stride_x));
+    DATA_TYPE bn         = 0;
+#ifdef HAS_BIAS
+    bn = *((__global DATA_TYPE *)(conv_b.ptr + current_slice * conv_b.stride_x));
+#endif /* HAS_BIAS */
+    bn = (bn - mean) * rvar;
+
+#ifndef USE_DEFAULT_GAMMA
+    const DATA_TYPE gamma_scalar = *((__global DATA_TYPE *)(bn_gamma.ptr + current_slice * bn_gamma.stride_x));
+    wn *= gamma_scalar;
+    bn *= gamma_scalar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+    const DATA_TYPE beta_scalar = *((__global DATA_TYPE *)(bn_beta.ptr + current_slice * bn_beta.stride_x));
+    bn += beta_scalar;
+#endif /* USE_DEFAULT_BETA */
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Store updated weights
+    VSTORE(VEC_SIZE)
+    (wn, 0, (__global DATA_TYPE *)fused_w.ptr);
+#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE *)(fused_w.ptr)) = wn;
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+
+    // Store updated bias
+    *((__global DATA_TYPE *)(fused_b.ptr + current_slice * fused_b.stride_x)) = bn;
+}
+#endif /* defined(NUM_CHANNELS) && defined(DATA_TYPE) && defined(EPSILON) */
-- 
cgit v1.2.1