From c93691717a6e7ca67e32b4dedd233b8c63b6daf2 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 26 Sep 2018 11:25:40 +0100 Subject: COMPMID-1523: Fuse BN node with convolution. Change-Id: I146936c9e98b343496a4b61cdbadf0eaa38e885a Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/154008 Reviewed-by: Michele DiGiorgio Reviewed-by: Giuseppe Rossini Tested-by: bsgcomp --- src/core/CL/cl_kernels/batchnormalization_layer.cl | 162 ++++++++++++++++++++- 1 file changed, 160 insertions(+), 2 deletions(-) (limited to 'src/core/CL/cl_kernels/batchnormalization_layer.cl') diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl index 5352af3c5a..df141269bc 100644 --- a/src/core/CL/cl_kernels/batchnormalization_layer.cl +++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl @@ -23,14 +23,14 @@ */ #include "helpers.h" -#if defined(VEC_SIZE) && defined(DATA_TYPE) - #define ADD_OP(a, b) ((a) + (b)) #define SUB_OP(a, b) ((a) - (b)) #define MUL_OP(a, b) ((a) * (b)) #define INVSQRT_OP(a) rsqrt((a)) #define SQCVT_SAT(a) (a) +#if defined(VEC_SIZE) && defined(DATA_TYPE) + #if defined(FUSED_ACTIVATION) #include "activation_layer.cl" #define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x) @@ -258,3 +258,161 @@ __kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input), (res, 0, (__global DATA_TYPE *)out.ptr); } #endif /* defined(VEC_SIZE) && defined(DATA_TYPE) */ + +#if defined(NUM_CHANNELS) && defined(DATA_TYPE) && defined(EPSILON) +/** Fuse batchnorm parameters to convolution layer parameters + * + * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Input tensor depth should be given as a preprocessor argument using -DNUM_CHANNELS=size. e.g. -DNUM_CHANNELS=16 + * @attention Batch normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f + * + * @param[in] conv_w_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] conv_w_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] conv_w_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] conv_w_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] conv_w_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] conv_w_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] conv_w_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] conv_w__stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] conv_w__step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] conv_w_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] bn_mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr + * @param[in] bn_mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] bn_mean_step_x bn_mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bn_mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] bn_var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr + * @param[in] bn_var_stride_x Stride of the var tensor in X dimension (in bytes) + * @param[in] bn_var_step_x bn_var_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bn_var_offset_first_element_in_bytes The offset of the first element in the var source tensor + * @param[out] fused_w_ptr Pointer to the destination weights tensors. Supported data types: same as @p input_ptr + * @param[in] fused_w_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] fused_w_step_x fused_w_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] fused_w_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] fused_w_step_y fused_w_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] fused_w_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] fused_w_step_z fused_w_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] fused_w_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] fused_w_step_w fused_w_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] fused_w_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] fused_b_ptr Pointer to the destination bias tensor. Supported data types: same as @p input_ptr + * @param[in] fused_b_stride_x Stride of the bias source tensor in X dimension (in bytes) + * @param[in] fused_b_step_x fused_b_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] fused_b_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] conv_b_ptr Pointer to the source bias tensor. Supported data types: same as @p input_ptr + * @param[in] conv_b_stride_x Stride of the beta source tensor in X dimension (in bytes) + * @param[in] conv_b_step_x conv_b_beta_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] conv_b_offset_first_element_in_bytes The offset of the first element in the source bias tensor + * @param[in] bn_beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr + * @param[in] bn_beta_stride_x Stride of the beta source tensor in X dimension (in bytes) + * @param[in] bn_beta_step_x bn_beta_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bn_beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor + * @param[in] bn_gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr + * @param[in] bn_gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes) + * @param[in] bn_gamma_step_x bn_gamma_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bn_gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor + * @param[in] epsilon Epsilon parameter in the batch normalization equation + */ +__kernel void fuse_batchnormalization_layer(TENSOR4D_DECLARATION(conv_w), + VECTOR_DECLARATION(bn_mean), + VECTOR_DECLARATION(bn_var) +#ifndef IN_PLACE_W + , + TENSOR4D_DECLARATION(fused_w) +#endif /* not IN_PLACE_W */ +#ifndef IN_PLACE_B + , + VECTOR_DECLARATION(fused_b) +#endif /* not IN_PLACE_B */ +#ifdef HAS_BIAS + , + VECTOR_DECLARATION(conv_b) +#endif /* HAS_BIAS */ +#ifndef USE_DEFAULT_BETA + , + VECTOR_DECLARATION(bn_beta) +#endif /* USE_DEFAULT_BETA */ +#ifndef USE_DEFAULT_GAMMA + , + VECTOR_DECLARATION(bn_gamma) +#endif /* USE_DEFAULT_GAMMA */ + ) +{ + Tensor4D conv_w = CONVERT_TO_TENSOR4D_STRUCT(conv_w, NUM_CHANNELS); + Vector bn_mean = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_mean); + Vector bn_var = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_var); + + // In-place ops +#ifdef IN_PLACE_W + Tensor4D fused_w = conv_w; +#else /* IN_PLACE_W */ + Tensor4D fused_w = CONVERT_TO_TENSOR4D_STRUCT(fused_w, NUM_CHANNELS); +#endif /* IN_PLACE */ +#ifdef IN_PLACE_B + Vector fused_b = conv_b; +#else /* IN_PLACE_W */ + Vector fused_b = CONVERT_TO_VECTOR_STRUCT_NO_STEP(fused_b); +#endif /* IN_PLACE */ + + // Conditional ops +#ifdef HAS_BIAS + Vector conv_b = CONVERT_TO_VECTOR_STRUCT_NO_STEP(conv_b); +#endif /* USE_DEFAULT_BETA */ +#ifndef USE_DEFAULT_BETA + Vector bn_beta = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_beta); +#endif /* USE_DEFAULT_BETA */ +#ifndef USE_DEFAULT_GAMMA + Vector bn_gamma = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_gamma); +#endif /* USE_DEFAULT_GAMMA */ + + const int current_slice = get_global_id(2) / NUM_CHANNELS; + +#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + conv_w.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * conv_w_stride_x; + fused_w.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * fused_w_stride_x; + + // Load W + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + wn = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)conv_w.ptr); +#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) + DATA_TYPE wn = *((__global DATA_TYPE *)(conv_w.ptr)); +#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) + + // rvar = 1 / sqrt(var + epsilon) + const DATA_TYPE var = *((__global DATA_TYPE *)(bn_var.ptr + current_slice * bn_var.stride_x)); + const DATA_TYPE rvar = INVSQRT_OP(ADD_OP(var, SQCVT_SAT((float)EPSILON))); + wn *= rvar; + + // Load b + const DATA_TYPE mean = *((__global DATA_TYPE *)(bn_mean.ptr + current_slice * bn_mean.stride_x)); + DATA_TYPE bn = 0; +#ifdef HAS_BIAS + bn = *((__global DATA_TYPE *)(conv_b.ptr + current_slice * conv_b.stride_x)); +#endif /* HAS_BIAS */ + bn = (bn - mean) * rvar; + +#ifndef USE_DEFAULT_GAMMA + const DATA_TYPE gamma_scalar = *((__global DATA_TYPE *)(bn_gamma.ptr + current_slice * bn_gamma.stride_x)); + wn *= gamma_scalar; + bn *= gamma_scalar; +#endif /* USE_DEFAULT_GAMMA */ + +#ifndef USE_DEFAULT_BETA + const DATA_TYPE beta_scalar = *((__global DATA_TYPE *)(bn_beta.ptr + current_slice * bn_beta.stride_x)); + bn += beta_scalar; +#endif /* USE_DEFAULT_BETA */ + +#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X) + // Store updated weights + VSTORE(VEC_SIZE) + (wn, 0, (__global DATA_TYPE *)fused_w.ptr); +#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) + *((__global DATA_TYPE *)(fused_w.ptr)) = wn; +#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) + + // Store updated bias + *((__global DATA_TYPE *)(fused_b.ptr + current_slice * fused_b.stride_x)) = bn; +} +#endif /* defined(NUM_CHANNELS) && defined(DATA_TYPE) && defined(EPSILON) */ -- cgit v1.2.1