From 141c31a532efb20698466aaabbecc92639f05b0d Mon Sep 17 00:00:00 2001
From: Sheri Zhang
Date: Thu, 8 Oct 2020 12:35:28 +0100
Subject: COMPMID-3705: Remove OpenCL padding: CLBatchNormalizationLayerKernel

Signed-off-by: Sheri Zhang
Change-Id: If077a245156be69f34834cbfbd0a36e570ee4149
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4107
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
Reviewed-by: Giorgio Arena
---
 src/core/CL/cl_kernels/batchnormalization_layer.cl | 43 +++++++-------
 .../CL/kernels/CLBatchNormalizationLayerKernel.cpp | 67 +++++++++-------------
 tests/validation/CL/BatchNormalizationLayer.cpp    | 33 +++++++++++
 3 files changed, 81 insertions(+), 62 deletions(-)

diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 16dbeaf2ad..89cbe4440e 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -129,7 +129,7 @@ __kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
     res = MUL_OP(gamma_vec, x_bar);
 #else /* USE_DEFAULT_GAMMA */
     // gamma is equal to 1, no need to perform multiplications
-    res = x_bar;
+    res = x_bar;
 #endif /* USE_DEFAULT_GAMMA */
 
 #ifndef USE_DEFAULT_BETA
@@ -198,19 +198,21 @@ __kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
 #endif /* USE_DEFAULT_GAMMA */
                                             float epsilon)
 {
-    Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
 #ifdef IN_PLACE
-    Tensor3D out = in;
+    __global uchar *output_addr = input_ptr;
 #else /* IN_PLACE */
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
 #endif /* IN_PLACE */
-    Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
-    Vector var = CONVERT_TO_VECTOR_STRUCT(var);
+    __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+    __global uchar *var_addr = var_ptr + var_offset_first_element_in_bytes + x_offs;
 #ifndef USE_DEFAULT_BETA
-    Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+    __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
 #endif /* USE_DEFAULT_BETA */
 #ifndef USE_DEFAULT_GAMMA
-    Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+    __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
 #endif /* USE_DEFAULT_GAMMA */
 
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
@@ -222,40 +224,37 @@ __kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     x_bar = 0;
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    res = 0;
-
-    const int current_slice = get_global_id(0);
+    res0 = 0;
 
-    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
-    denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(var.ptr + current_slice * VEC_SIZE * var.stride_x));
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+    denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
     denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
 
     // Calculate x bar and store results
-    numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * mean.stride_x));
+    numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
     numerator = SUB_OP(data, numerator);
     x_bar = MUL_OP(numerator, denominator);
 
 #ifndef USE_DEFAULT_GAMMA
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(gamma.ptr + current_slice * VEC_SIZE * gamma.stride_x));
+    gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);
 
-    res = MUL_OP(gamma_vec, x_bar);
+    res0 = MUL_OP(gamma_vec, x_bar);
 #else /* USE_DEFAULT_GAMMA */
     // gamma is equal to 1, no need to perform multiplications
-    res = x_bar;
+    res0 = x_bar;
 #endif /* USE_DEFAULT_GAMMA */
 
 #ifndef USE_DEFAULT_BETA
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(beta.ptr + current_slice * VEC_SIZE * beta.stride_x));
+    beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
     // beta is not zero, hence we need to perform the addition
-    res = ADD_OP(res, beta_vec);
+    res0 = ADD_OP(res0, beta_vec);
 #endif /* USE_DEFAULT_BETA */
 
-    res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
+    res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);
 
-    VSTORE(VEC_SIZE)
-    (res, 0, (__global DATA_TYPE *)out.ptr);
+    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
 }
 #endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE)*/
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index a2cabcfd1f..1c1df6c4eb 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -80,16 +80,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
-                                                        ITensorInfo *mean, ITensorInfo *var, ITensorInfo *beta, ITensorInfo *gamma)
+std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output)
 {
-    if(output != nullptr)
-    {
-        // Output tensor auto initialization if not yet initialized
-        auto_init_if_empty(*output, *input->clone());
-    }
-
-    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0));
 
     // Configure kernel window
     Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
@@ -107,25 +100,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
         window_changed = update_window_and_padding(win, input_access);
     }
 
-    // Mean, var, gamma and beta get parallelized for the NHWC case as they follow the channel dimension, which is along the first axis
-    if(input->data_layout() == DataLayout::NHWC)
-    {
-        AccessWindowHorizontal mean_access(mean, 0, num_elems_processed_per_iteration);
-        AccessWindowHorizontal var_access(var, 0, num_elems_processed_per_iteration);
-        window_changed = window_changed || update_window_and_padding(win, mean_access, var_access);
-
-        if(beta != nullptr)
-        {
-            AccessWindowHorizontal beta_access(beta, 0, num_elems_processed_per_iteration);
-            window_changed = window_changed || update_window_and_padding(win, beta_access);
-        }
-        if(gamma != nullptr)
-        {
-            AccessWindowHorizontal gamma_access(gamma, 0, num_elems_processed_per_iteration);
-            window_changed = window_changed || update_window_and_padding(win, gamma_access);
-        }
-    }
-
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
@@ -162,12 +136,13 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
                                                       mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
                                                       (gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
 
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+    unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
 
     // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
     build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
     build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
    build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
@@ -191,13 +166,24 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
     }
     _kernel.setArg(idx++, _epsilon);
 
+    if(output != nullptr)
+    {
+        // Output tensor auto initialization if not yet initialized
+        auto_init_if_empty(*output->info(), *input->info()->clone());
+    }
+
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info(),
-                                                    mean->info(), var->info(),
-                                                    (beta != nullptr) ? beta->info() : nullptr,
-                                                    (gamma != nullptr) ? gamma->info() : nullptr);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
+    if(input->info()->data_layout() == DataLayout::NHWC)
+    {
+        Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+        ICLKernel::configure_internal(win);
+    }
+    else
+    {
+        auto win_config = validate_and_configure_window_nchw(input->info(), (_run_in_place) ? nullptr : output->info());
+        ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+        ICLKernel::configure_internal(win_config.second);
+    }
 
     _config_id = "batch_normalization_layer_";
     _config_id += string_from_data_type(input->info()->data_type());
@@ -218,11 +204,12 @@ Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const
 {
     const bool run_in_place = (output == nullptr) || (output == input);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(),
-                                                              mean->clone().get(), var->clone().get(),
-                                                              (beta != nullptr) ? beta->clone().get() : nullptr,
-                                                              (gamma != nullptr) ? gamma->clone().get() : nullptr)
-                                .first);
+
+    if(input->data_layout() != DataLayout::NHWC)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
+                                    .first);
+    }
 
     return Status{};
 }
diff --git a/tests/validation/CL/BatchNormalizationLayer.cpp b/tests/validation/CL/BatchNormalizationLayer.cpp
index 3d1848bd11..e67f4cc199 100644
--- a/tests/validation/CL/BatchNormalizationLayer.cpp
+++ b/tests/validation/CL/BatchNormalizationLayer.cpp
@@ -62,6 +62,29 @@ const auto common_fusion_dataset = combine(combine(combine(framework::dataset::m
                                                    framework::dataset::make("UseBeta", { false, true })),
                                                    framework::dataset::make("UseGamma", { false, true })),
                                            framework::dataset::make("Epsilon", { 0.001f }));
+
+bool validate_zero_padding(TensorShape shape0, const TensorShape shape1, float epsilon, ActivationLayerInfo act_info, DataType dt, DataLayout data_layout)
+{
+    if(data_layout == DataLayout::NHWC)
+    {
+        permute(shape0, PermutationVector(2U, 0U, 1U));
+    }
+
+    // Create tensors
+    CLTensor src   = create_tensor<CLTensor>(shape0, dt, 1, QuantizationInfo(), data_layout);
+    CLTensor dst   = create_tensor<CLTensor>(shape0, dt, 1, QuantizationInfo(), data_layout);
+    CLTensor mean  = create_tensor<CLTensor>(shape1, dt, 1);
+    CLTensor var   = create_tensor<CLTensor>(shape1, dt, 1);
+    CLTensor beta  = create_tensor<CLTensor>(shape1, dt, 1);
+    CLTensor gamma = create_tensor<CLTensor>(shape1, dt, 1);
+
+    // Create and configure function
+    CLBatchNormalizationLayer norm;
+    norm.configure(&src, &dst, &mean, &var, &beta, &gamma, epsilon, act_info);
+
+    return src.info()->padding().empty() && dst.info()->padding().empty() && mean.info()->padding().empty() && var.info()->padding().empty() && beta.info()->padding().empty()
+           && gamma.info()->padding().empty();
+}
 } // namespace
 
 TEST_SUITE(CL)
@@ -118,6 +141,16 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
 
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallRandomBatchNormalizationLayerDataset(),
+                                                                                 act_infos),
+                                                                         framework::dataset::make("DataType", { DataType::F32, DataType::F16 })),
+                                                                 framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+               shape0, shape1, epsilon, act_infos, data_type, data_layout)
+{
+    bool status = validate_zero_padding(shape0, shape1, epsilon, act_infos, data_type, data_layout);
+    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(Random, CLBatchNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallRandomBatchNormalizationLayerDataset(),
-- 
cgit v1.2.1
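
Aside: the padding removal above rests on two pieces of arithmetic. configure() shrinks the vector width with adjust_vec_size(16 / element_size, dim0) and passes dim0 % VEC_SIZE to the kernel as -DVEC_SIZE_LEFTOVER; inside the kernel, the x_offs computation rewinds each work-item's offset by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE elements (clamped at 0) so full-width stores stay in bounds, while STORE_VECTOR_SELECT makes work-item 0 alone emit a partial store of the leftover elements. The following standalone C++ sketch traces that arithmetic with made-up example sizes; adjust_vec_size here is a local stand-in for the library helper of the same name, not the library code itself.

#include <algorithm>
#include <cstdio>

// Stand-in for the library's adjust_vec_size(): cap the preferred vector
// width so a single vector never exceeds the dimension being processed.
static unsigned int adjust_vec_size(unsigned int preferred, unsigned int dim0)
{
    return std::min(preferred, dim0);
}

int main()
{
    const unsigned int element_size = 4;  // e.g. F32
    const unsigned int dim0         = 18; // elements along the NHWC x-axis (example value)

    const unsigned int vec_size = adjust_vec_size(16 / element_size, dim0); // 4
    const unsigned int leftover = dim0 % vec_size;                          // 2 -> -DVEC_SIZE_LEFTOVER=2

    // One work-item per vec_size elements, with the dimension rounded up as
    // in calculate_max_window(..., Steps(vec_size)).
    const unsigned int num_items = (dim0 + vec_size - 1) / vec_size;

    for(unsigned int gid = 0; gid < num_items; ++gid)
    {
        // Mirrors the kernel's x_offs (in elements rather than bytes):
        // rewind by (vec_size - leftover) % vec_size, clamped at 0.
        const int x_offs = std::max((int)(gid * vec_size) - (int)((vec_size - leftover) % vec_size), 0);

        // STORE_VECTOR_SELECT lets only work-item 0 do a partial store.
        const unsigned int stored = (leftover != 0 && gid == 0) ? leftover : vec_size;

        printf("gid=%u stores %u element(s) at offsets [%d, %u)\n", gid, stored, x_offs, x_offs + stored);
    }
    return 0;
}

With these example numbers the five work-items write [0, 2), [2, 6), [6, 10), [10, 14) and [14, 18): every element of the row is stored exactly once, which is why the tensor no longer needs padding up to a multiple of the vector width.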