author     Sheri Zhang <sheri.zhang@arm.com>    2020-10-08 12:35:28 +0100
committer  Sheri Zhang <sheri.zhang@arm.com>    2020-10-13 10:12:07 +0000
commit     141c31a532efb20698466aaabbecc92639f05b0d (patch)
tree       1ed97d68e8142c3b6ac4a8a9bab9fda8ab85fa9d /src
parent     6a9e801743b85706177b01e1ffb31320222c4378 (diff)
download   ComputeLibrary-141c31a532efb20698466aaabbecc92639f05b0d.tar.gz
COMPMID-3705: Remove OpenCL padding: CLBatchNormalizationLayerKernel
Signed-off-by: Sheri Zhang <sheri.zhang@arm.com>
Change-Id: If077a245156be69f34834cbfbd0a36e570ee4149
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4107
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Diffstat (limited to 'src')
-rw-r--r--  src/core/CL/cl_kernels/batchnormalization_layer.cl       | 43
-rw-r--r--  src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp  | 67
2 files changed, 48 insertions(+), 62 deletions(-)
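
Note on the approach: this patch drops the OpenCL padding requirement for the NHWC path. Instead of relying on a padded execution window, the kernel computes raw byte addresses from the global IDs and handles the tail of the channel dimension with a partial store: when VEC_SIZE does not divide the first dimension, the first block writes only the VEC_SIZE_LEFTOVER valid lanes (see the STORE_VECTOR_SELECT call in the kernel diff below), while every other block keeps writing full vectors. The NCHW path keeps the existing padded-window configuration. Below is a minimal scalar sketch of that conditional store; partial_store() is a hypothetical stand-in, not the Compute Library macro.

#include <cstring>

// Illustration only: the effect of
// STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, cond)
// written as scalar C++. partial_store() is a hypothetical stand-in for the macro.
template <typename T, int VEC_SIZE>
void partial_store(const T (&res)[VEC_SIZE], T *output_addr, int leftover, bool is_leftover_block)
{
    // The leftover block writes only the valid lanes; every other block writes the full vector.
    const int lanes = is_leftover_block ? leftover : VEC_SIZE;
    std::memcpy(output_addr, res, lanes * sizeof(T));
}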
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 16dbeaf2ad..89cbe4440e 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -129,7 +129,7 @@ __kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
res = MUL_OP(gamma_vec, x_bar);
#else /* USE_DEFAULT_GAMMA */
// gamma is equal to 1, no need to perform multiplications
- res = x_bar;
+ res = x_bar;
#endif /* USE_DEFAULT_GAMMA */
#ifndef USE_DEFAULT_BETA
@@ -198,19 +198,21 @@ __kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
#endif /* USE_DEFAULT_GAMMA */
float epsilon)
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
#ifdef IN_PLACE
- Tensor3D out = in;
+ __global uchar *output_addr = input_ptr;
#else /* IN_PLACE */
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
#endif /* IN_PLACE */
- Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
- Vector var = CONVERT_TO_VECTOR_STRUCT(var);
+ __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+ __global uchar *var_addr = var_ptr + var_offset_first_element_in_bytes + x_offs;
#ifndef USE_DEFAULT_BETA
- Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+ __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
#endif /* USE_DEFAULT_BETA */
#ifndef USE_DEFAULT_GAMMA
- Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+ __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
#endif /* USE_DEFAULT_GAMMA */
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
@@ -222,40 +224,37 @@ __kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
x_bar = 0;
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res = 0;
-
- const int current_slice = get_global_id(0);
+ res0 = 0;
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
- denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(var.ptr + current_slice * VEC_SIZE * var.stride_x));
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+ denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
// Calculate x bar and store results
- numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * mean.stride_x));
+ numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
numerator = SUB_OP(data, numerator);
x_bar = MUL_OP(numerator, denominator);
#ifndef USE_DEFAULT_GAMMA
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(gamma.ptr + current_slice * VEC_SIZE * gamma.stride_x));
+ gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);
- res = MUL_OP(gamma_vec, x_bar);
+ res0 = MUL_OP(gamma_vec, x_bar);
#else /* USE_DEFAULT_GAMMA */
// gamma is equal to 1, no need to perform multiplications
- res = x_bar;
+ res0 = x_bar;
#endif /* USE_DEFAULT_GAMMA */
#ifndef USE_DEFAULT_BETA
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(beta.ptr + current_slice * VEC_SIZE * beta.stride_x));
+ beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
// beta is not zero, hence we need to perform the addition
- res = ADD_OP(res, beta_vec);
+ res0 = ADD_OP(res0, beta_vec);
#endif /* USE_DEFAULT_BETA */
- res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
+ res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);
- VSTORE(VEC_SIZE)
- (res, 0, (__global DATA_TYPE *)out.ptr);
+ STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE)*/
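
How the NHWC kernel above stays in bounds without padding: the x_offs expression shifts every work item except the first back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE elements and clamps the first one to zero, so the first block covers only the leftover lanes and all later blocks are full vectors that end exactly at the row boundary. A sketch of that arithmetic follows; the helper name and signature are illustrative, not part of the patch.

#include <algorithm>
#include <cstddef>

// Mirrors the x_offs expression in batchnormalization_layer_nhwc().
inline std::size_t x_offset_bytes(std::size_t gid0, std::size_t vec_size,
                                  std::size_t leftover, std::size_t elem_size)
{
    // For gid0 == 0 the subtraction goes negative and is clamped to 0, so the first
    // block starts at the row origin and stores only 'leftover' lanes (when leftover != 0).
    // Every later block is pulled back by (vec_size - leftover) % vec_size elements,
    // so its full-width vector ends exactly at the row boundary.
    const long offs = static_cast<long>(gid0 * vec_size * elem_size)
                    - static_cast<long>(((vec_size - leftover) % vec_size) * elem_size);
    return static_cast<std::size_t>(std::max(offs, 0L));
}

For example, with a first dimension of 10 elements and VEC_SIZE = 4 (leftover 2), the three blocks start at elements 0, 2 and 6: the first stores 2 lanes, the other two store 4, and nothing is written past element 9.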
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index a2cabcfd1f..1c1df6c4eb 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -80,16 +80,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
- ITensorInfo *mean, ITensorInfo *var, ITensorInfo *beta, ITensorInfo *gamma)
+std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output)
{
- if(output != nullptr)
- {
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, *input->clone());
- }
-
- const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0));
// Configure kernel window
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
@@ -107,25 +100,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
window_changed = update_window_and_padding(win, input_access);
}
- // Mean, var, gamma and beta get parallelized for the NHWC case as they follow the channel dimension, which is along the first axis
- if(input->data_layout() == DataLayout::NHWC)
- {
- AccessWindowHorizontal mean_access(mean, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal var_access(var, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, mean_access, var_access);
-
- if(beta != nullptr)
- {
- AccessWindowHorizontal beta_access(beta, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, beta_access);
- }
- if(gamma != nullptr)
- {
- AccessWindowHorizontal gamma_access(gamma, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, gamma_access);
- }
- }
-
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
@@ -162,12 +136,13 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
(gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
@@ -191,13 +166,24 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
}
_kernel.setArg<cl_float>(idx++, _epsilon);
+ if(output != nullptr)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
+ }
+
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info(),
- mean->info(), var->info(),
- (beta != nullptr) ? beta->info() : nullptr,
- (gamma != nullptr) ? gamma->info() : nullptr);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
+ if(input->info()->data_layout() == DataLayout::NHWC)
+ {
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ ICLKernel::configure_internal(win);
+ }
+ else
+ {
+ auto win_config = validate_and_configure_window_nchw(input->info(), (_run_in_place) ? nullptr : output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+ }
_config_id = "batch_normalization_layer_";
_config_id += string_from_data_type(input->info()->data_type());
@@ -218,11 +204,12 @@ Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(),
- mean->clone().get(), var->clone().get(),
- (beta != nullptr) ? beta->clone().get() : nullptr,
- (gamma != nullptr) ? gamma->clone().get() : nullptr)
- .first);
+
+ if(input->data_layout() != DataLayout::NHWC)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
+ .first);
+ }
return Status{};
}
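
Host-side note: the two new build options drive the scheme above. num_elems_processed_per_iteration comes from adjust_vec_size(16 / element_size, dim0), which appears to cap the preferred vector width at the first tensor dimension (an assumption about its behaviour, not a copy of its implementation), and VEC_SIZE_LEFTOVER is simply dim0 modulo that width. A self-contained sketch, under those assumptions, of the values that end up in -DVEC_SIZE and -DVEC_SIZE_LEFTOVER:

#include <algorithm>
#include <cstdio>

int main()
{
    const unsigned int element_size = 4;                          // e.g. F32
    const unsigned int dim0         = 10;                         // first (channel) dimension in NHWC
    const unsigned int preferred    = 16 / element_size;          // 4, as in the host code above
    const unsigned int vec_size     = std::min(preferred, dim0);  // assumed effect of adjust_vec_size()
    const unsigned int leftover     = dim0 % vec_size;            // value passed as -DVEC_SIZE_LEFTOVER
    std::printf("VEC_SIZE=%u VEC_SIZE_LEFTOVER=%u\n", vec_size, leftover); // prints 4 and 2
    return 0;
}

With those defines, the NHWC window needs no border padding, which is why configure() now calls calculate_max_window() directly for NHWC and only the NCHW path still goes through validate_and_configure_window_nchw().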