aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/batchnormalization_layer.cl
diff options
context:
space:
mode:
authorSheri Zhang <sheri.zhang@arm.com>2020-10-08 12:35:28 +0100
committerSheri Zhang <sheri.zhang@arm.com>2020-10-13 10:12:07 +0000
commit141c31a532efb20698466aaabbecc92639f05b0d (patch)
tree1ed97d68e8142c3b6ac4a8a9bab9fda8ab85fa9d /src/core/CL/cl_kernels/batchnormalization_layer.cl
parent6a9e801743b85706177b01e1ffb31320222c4378 (diff)
downloadComputeLibrary-141c31a532efb20698466aaabbecc92639f05b0d.tar.gz
COMPMID-3705: Remove OpenCL padding: CLBatchNormalizationLayerKernel
Signed-off-by: Sheri Zhang <sheri.zhang@arm.com> Change-Id: If077a245156be69f34834cbfbd0a36e570ee4149 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4107 Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/batchnormalization_layer.cl')
-rw-r--r--src/core/CL/cl_kernels/batchnormalization_layer.cl43
1 files changed, 21 insertions, 22 deletions
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 16dbeaf2ad..89cbe4440e 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -129,7 +129,7 @@ __kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
res = MUL_OP(gamma_vec, x_bar);
#else /* USE_DEFAULT_GAMMA */
// gamma is equal to 1, no need to perform multiplications
- res = x_bar;
+ res = x_bar;
#endif /* USE_DEFAULT_GAMMA */
#ifndef USE_DEFAULT_BETA
@@ -198,19 +198,21 @@ __kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
#endif /* USE_DEFAULT_GAMMA */
float epsilon)
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
#ifdef IN_PLACE
- Tensor3D out = in;
+ __global uchar *output_addr = input_ptr;
#else /* IN_PLACE */
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
#endif /* IN_PLACE */
- Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
- Vector var = CONVERT_TO_VECTOR_STRUCT(var);
+ __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+ __global uchar *var_addr = var_ptr + var_offset_first_element_in_bytes + x_offs;
#ifndef USE_DEFAULT_BETA
- Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+ __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
#endif /* USE_DEFAULT_BETA */
#ifndef USE_DEFAULT_GAMMA
- Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+ __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
#endif /* USE_DEFAULT_GAMMA */
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
@@ -222,40 +224,37 @@ __kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
x_bar = 0;
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res = 0;
-
- const int current_slice = get_global_id(0);
+ res0 = 0;
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
- denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(var.ptr + current_slice * VEC_SIZE * var.stride_x));
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+ denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
// Calculate x bar and store results
- numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * mean.stride_x));
+ numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
numerator = SUB_OP(data, numerator);
x_bar = MUL_OP(numerator, denominator);
#ifndef USE_DEFAULT_GAMMA
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(gamma.ptr + current_slice * VEC_SIZE * gamma.stride_x));
+ gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);
- res = MUL_OP(gamma_vec, x_bar);
+ res0 = MUL_OP(gamma_vec, x_bar);
#else /* USE_DEFAULT_GAMMA */
// gamma is equal to 1, no need to perform multiplications
- res = x_bar;
+ res0 = x_bar;
#endif /* USE_DEFAULT_GAMMA */
#ifndef USE_DEFAULT_BETA
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(beta.ptr + current_slice * VEC_SIZE * beta.stride_x));
+ beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
// beta is not zero, hence we need to perform the addition
- res = ADD_OP(res, beta_vec);
+ res0 = ADD_OP(res0, beta_vec);
#endif /* USE_DEFAULT_BETA */
- res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
+ res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);
- VSTORE(VEC_SIZE)
- (res, 0, (__global DATA_TYPE *)out.ptr);
+ STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE)*/