aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/batchnormalization_layer.cl
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2018-09-26 11:25:40 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:55:45 +0000
commitc93691717a6e7ca67e32b4dedd233b8c63b6daf2 (patch)
treed3929606b525e89f60299b16f95eb4223d11d5a8 /src/core/CL/cl_kernels/batchnormalization_layer.cl
parente6dbde0128bf33b5d72a00c480bd92c290fd17b7 (diff)
downloadComputeLibrary-c93691717a6e7ca67e32b4dedd233b8c63b6daf2.tar.gz
COMPMID-1523: Fuse BN node with convolution.
Change-Id: I146936c9e98b343496a4b61cdbadf0eaa38e885a Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/154008 Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com> Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com> Tested-by: bsgcomp <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/batchnormalization_layer.cl')
-rw-r--r--src/core/CL/cl_kernels/batchnormalization_layer.cl162
1 files changed, 160 insertions, 2 deletions
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 5352af3c5a..df141269bc 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -23,14 +23,14 @@
*/
#include "helpers.h"
-#if defined(VEC_SIZE) && defined(DATA_TYPE)
-
#define ADD_OP(a, b) ((a) + (b))
#define SUB_OP(a, b) ((a) - (b))
#define MUL_OP(a, b) ((a) * (b))
#define INVSQRT_OP(a) rsqrt((a))
#define SQCVT_SAT(a) (a)
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
+
#if defined(FUSED_ACTIVATION)
#include "activation_layer.cl"
#define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
@@ -258,3 +258,161 @@ __kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
(res, 0, (__global DATA_TYPE *)out.ptr);
}
#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) */
+
+#if defined(NUM_CHANNELS) && defined(DATA_TYPE) && defined(EPSILON)
+/** Fuse batchnorm parameters to convolution layer parameters
+ *
+ * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DNUM_CHANNELS=size. e.g. -DNUM_CHANNELS=16
+ * @attention Batch normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
+ *
+ * @param[in]  conv_w_ptr                             Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  conv_w_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  conv_w_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  conv_w_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  conv_w_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  conv_w_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  conv_w_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  conv_w_stride_w                        Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  conv_w_step_w                          input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  conv_w_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  bn_mean_ptr                            Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  bn_mean_stride_x                       Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  bn_mean_step_x                         bn_mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bn_mean_offset_first_element_in_bytes  The offset of the first element in the mean source tensor
+ * @param[in]  bn_var_ptr                             Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in]  bn_var_stride_x                        Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  bn_var_step_x                          bn_var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bn_var_offset_first_element_in_bytes   The offset of the first element in the var source tensor
+ * @param[out] fused_w_ptr                            Pointer to the destination weights tensors. Supported data types: same as @p input_ptr
+ * @param[in]  fused_w_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  fused_w_step_x                         fused_w_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  fused_w_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  fused_w_step_y                         fused_w_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  fused_w_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  fused_w_step_z                         fused_w_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  fused_w_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  fused_w_step_w                         fused_w_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  fused_w_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  fused_b_ptr                            Pointer to the destination bias tensor. Supported data types: same as @p input_ptr
+ * @param[in]  fused_b_stride_x                       Stride of the bias source tensor in X dimension (in bytes)
+ * @param[in]  fused_b_step_x                         fused_b_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  fused_b_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  conv_b_ptr                             Pointer to the source bias tensor. Supported data types: same as @p input_ptr
+ * @param[in]  conv_b_stride_x                        Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  conv_b_step_x                          conv_b_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  conv_b_offset_first_element_in_bytes   The offset of the first element in the source bias tensor
+ * @param[in]  bn_beta_ptr                            Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  bn_beta_stride_x                       Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  bn_beta_step_x                         bn_beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bn_beta_offset_first_element_in_bytes  The offset of the first element in the beta source tensor
+ * @param[in]  bn_gamma_ptr                           Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  bn_gamma_stride_x                      Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  bn_gamma_step_x                        bn_gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bn_gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in]  epsilon                                Epsilon parameter in the batch normalization equation
+ */
+__kernel void fuse_batchnormalization_layer(TENSOR4D_DECLARATION(conv_w),
+                                            VECTOR_DECLARATION(bn_mean),
+                                            VECTOR_DECLARATION(bn_var)
+#ifndef IN_PLACE_W
+                                            ,
+                                            TENSOR4D_DECLARATION(fused_w)
+#endif /* not IN_PLACE_W */
+#ifndef IN_PLACE_B
+                                            ,
+                                            VECTOR_DECLARATION(fused_b)
+#endif /* not IN_PLACE_B */
+#ifdef HAS_BIAS
+                                            ,
+                                            VECTOR_DECLARATION(conv_b)
+#endif /* HAS_BIAS */
+#ifndef USE_DEFAULT_BETA
+                                            ,
+                                            VECTOR_DECLARATION(bn_beta)
+#endif /* not USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+                                            ,
+                                            VECTOR_DECLARATION(bn_gamma)
+#endif /* not USE_DEFAULT_GAMMA */
+                                            )
+{
+    Tensor4D conv_w  = CONVERT_TO_TENSOR4D_STRUCT(conv_w, NUM_CHANNELS);
+    Vector   bn_mean = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_mean);
+    Vector   bn_var  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_var);
+
+    // Conditional ops. conv_b must be declared before the in-place bias
+    // handling below, which aliases it. NOTE(review): the IN_PLACE_B path
+    // only compiles when HAS_BIAS is also defined - presumably the host
+    // side guarantees this; verify against the kernel configuration code.
+#ifdef HAS_BIAS
+    Vector conv_b = CONVERT_TO_VECTOR_STRUCT_NO_STEP(conv_b);
+#endif /* HAS_BIAS */
+#ifndef USE_DEFAULT_BETA
+    Vector bn_beta = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_beta);
+#endif /* not USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+    Vector bn_gamma = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_gamma);
+#endif /* not USE_DEFAULT_GAMMA */
+
+    // In-place ops: alias the outputs onto the inputs when no separate
+    // destination tensors were passed.
+#ifdef IN_PLACE_W
+    Tensor4D fused_w = conv_w;
+#else  /* IN_PLACE_W */
+    Tensor4D fused_w = CONVERT_TO_TENSOR4D_STRUCT(fused_w, NUM_CHANNELS);
+#endif /* IN_PLACE_W */
+#ifdef IN_PLACE_B
+    Vector fused_b = conv_b;
+#else  /* IN_PLACE_B */
+    Vector fused_b = CONVERT_TO_VECTOR_STRUCT_NO_STEP(fused_b);
+#endif /* IN_PLACE_B */
+
+    // One output channel (OFM) per NUM_CHANNELS slices along Z
+    const int current_slice = get_global_id(2) / NUM_CHANNELS;
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    conv_w.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * conv_w_stride_x;
+    fused_w.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * fused_w_stride_x;
+
+    // Load W
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    wn = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)conv_w.ptr);
+#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    DATA_TYPE wn = *((__global DATA_TYPE *)(conv_w.ptr));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+
+    // rvar = 1 / sqrt(var + epsilon)
+    const DATA_TYPE var  = *((__global DATA_TYPE *)(bn_var.ptr + current_slice * bn_var.stride_x));
+    const DATA_TYPE rvar = INVSQRT_OP(ADD_OP(var, SQCVT_SAT((float)EPSILON)));
+    wn *= rvar;
+
+    // Load b
+    const DATA_TYPE mean = *((__global DATA_TYPE *)(bn_mean.ptr + current_slice * bn_mean.stride_x));
+    DATA_TYPE       bn   = 0;
+#ifdef HAS_BIAS
+    bn = *((__global DATA_TYPE *)(conv_b.ptr + current_slice * conv_b.stride_x));
+#endif /* HAS_BIAS */
+    bn = (bn - mean) * rvar;
+
+#ifndef USE_DEFAULT_GAMMA
+    const DATA_TYPE gamma_scalar = *((__global DATA_TYPE *)(bn_gamma.ptr + current_slice * bn_gamma.stride_x));
+    wn *= gamma_scalar;
+    bn *= gamma_scalar;
+#endif /* not USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+    const DATA_TYPE beta_scalar = *((__global DATA_TYPE *)(bn_beta.ptr + current_slice * bn_beta.stride_x));
+    bn += beta_scalar;
+#endif /* not USE_DEFAULT_BETA */
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Store updated weights
+    VSTORE(VEC_SIZE)
+    (wn, 0, (__global DATA_TYPE *)fused_w.ptr);
+#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE *)(fused_w.ptr)) = wn;
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+
+    // Store updated bias
+    *((__global DATA_TYPE *)(fused_b.ptr + current_slice * fused_b.stride_x)) = bn;
+}
+#endif /* defined(NUM_CHANNELS) && defined(DATA_TYPE) && defined(EPSILON) */