Diffstat (limited to 'src/core/CL/kernels/CLSoftmaxLayerKernel.cpp')
-rw-r--r-- | src/core/CL/kernels/CLSoftmaxLayerKernel.cpp | 258
1 file changed, 11 insertions(+), 247 deletions(-)
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 04a7639a83..447d6eeafa 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -79,33 +79,14 @@ CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float
     return build_opts;
 }
 
-// Arguments Validation
-
-Status validate_arguments_1DMax(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        // Softmax across the x dimension
-        TensorShape output_shape{ input->tensor_shape() };
-        output_shape.set(0, 1);
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
-    }
-
-    return Status{};
-}
-
-Status validate_arguments_1DShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
+Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);
 
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max);
+
     const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());
 
     // Checks performed when output is configured
@@ -141,33 +122,6 @@ Status validate_arguments_1DShiftExpSum(const ITensorInfo *input, const ITensorI
     return Status{};
 }
 
-Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max);
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
-    }
-
-    // Checks performed when sum is configured
-    if(sum->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(max, sum);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(max, sum);
-    }
-
-    return Status{};
-}
-
 Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::S32, DataType::F16, DataType::F32);
@@ -200,58 +154,6 @@ Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *su
 
 // Window validation
 
-std::pair<Status, Window> validate_and_configure_window_1DMax(ITensorInfo *input, ITensorInfo *output)
-{
-    TensorShape output_shape{ input->tensor_shape() };
-    output_shape.set(0, 1);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
-
-    // The kernel loops over all elements in steps of 16
-    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->dimension(0), 16);
-    constexpr unsigned int num_elems_written_per_iteration = 1;
-
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
-
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
-std::pair<Status, Window> validate_and_configure_window_1DShiftExpSum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
-{
-    const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());
-    const DataType tmp_data_type = is_quantized_asymmetric ? DataType::S32 : input->data_type();
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*sum, max->clone()->set_data_type(tmp_data_type).set_fixed_point_position(input->fixed_point_position()));
-    auto_init_if_empty(*output, input->clone()->set_data_type(tmp_data_type));
-
-    // The kernel loops over all elements in steps of 16
-    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->dimension(0), 16);
-
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal max_access(max, 0, 1);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal sum_access(sum, 0, 1);
-
-    bool window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access);
-
-    output_access.set_valid_region(win, input->valid_region());
-    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape()));
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
 std::pair<Status, Window> validate_and_configure_window_1DMaxShiftExpSum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
 {
     // Output auto initialization if not yet initialized
@@ -305,146 +207,6 @@ std::pair<Status, Window> validate_and_configure_window_1DNorm(ITensorInfo *inpu
 
 } // namespace
 
-void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    TensorShape output_shape{ input->info()->tensor_shape() };
-    output_shape.set(0, 1);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMax(input->info(), output->info()));
-
-    _input  = input;
-    _output = output;
-
-    const DataType data_type = input->info()->data_type();
-
-    // Set build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
-    build_opts.add_option_if(is_data_type_fixed_point(data_type),
-                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
-    build_opts.add_option_if(data_type == DataType::F16, "-DUSE_F16");
-    // Tell the kernel that the width is not a multiple of 16
-    build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, "-DNON_MULTIPLE_OF_16");
-
-    // Create kernel
-    std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "softmax_layer_max_quantized" : "softmax_layer_max";
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
-    // Set fixed arguments
-    unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
-    _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window_1DMax(input->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure(win_config.second);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = "softmax_layer_";
-    _config_id += lower_string(string_from_data_type(data_type));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-Status CLLogits1DMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMax(input, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DMax(input->clone().get(), output->clone().get()).first);
-
-    return Status{};
-}
-
-CLLogits1DShiftExpSumKernel::CLLogits1DShiftExpSumKernel()
-    : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
-{
-}
-
-void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);
-
-    const bool     is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->info()->data_type());
-    const DataType tmp_data_type           = is_quantized_asymmetric ? DataType::S32 : input->info()->data_type();
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*sum->info(), max->info()->clone()->set_data_type(tmp_data_type).set_fixed_point_position(input->info()->fixed_point_position()));
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(tmp_data_type));
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DShiftExpSum(input->info(), max->info(), output->info(), sum->info()));
-
-    _input  = input;
-    _max    = max;
-    _output = output;
-    _sum    = sum;
-
-    const DataType dt       = input->info()->data_type();
-    auto           beta_int = static_cast<int>(lround(beta * (1 << input->info()->fixed_point_position())));
-
-    // Set build options
-    CLBuildOptions build_opts;
-    build_opts.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
-    build_opts.add_option_if(is_data_type_fixed_point(dt),
-                             std::string("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
-    build_opts.add_option_if(dt == DataType::F16, std::string("-DUSE_F16"));
-    // Tell the kernel that the width is not a multiple of 16
-    build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, std::string("-DNON_MULTIPLE_OF_16"));
-    build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), std::string("-DBETA=" + support::cpp11::to_string(beta_int)));
-    build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), std::string("-DBETA=" + float_to_string_with_full_precision(beta)));
-    build_opts.add_options_if(is_quantized_asymmetric,
-                              prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());
-
-    // Create kernel
-    std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_shift_exp_sum_quantized" : "softmax_layer_shift_exp_sum";
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
-    // Set fixed arguments
-    unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
-    _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
-
-    // Configure window
-    auto win_config = validate_and_configure_window_1DShiftExpSum(input->info(), max->info(), output->info(), sum->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure(win_config.second);
-}
-
-Status CLLogits1DShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DShiftExpSum(input, max, output, sum));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DShiftExpSum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first);
-
-    return Status{};
-}
-
-void CLLogits1DShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice            = window_collapsed.first_slice_window_3D();
-
-    do
-    {
-        unsigned int idx = 0;
-        // Set inputs
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _max, slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        add_3D_tensor_argument(idx, _sum, slice);
-        enqueue(queue, *this, slice, _lws_hint);
-    }
-    while(window_collapsed.slide_window_slice_3D(slice));
-}
-
 /**< Grid size (obtained through auto-tuning) */
 const unsigned int CLLogits1DMaxShiftExpSumKernel::_grid_size = 64;
 /**< Vector size in the serial case (obtained through auto-tuning) */
@@ -485,9 +247,11 @@ void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor
     build_opts.add_option_if(dt == DataType::F16, "-DUSE_F16");
     build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), "-DBETA=" + support::cpp11::to_string(beta_int));
    build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
+    build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());
 
-    _lws_hint               = cl::NullRange;
-    std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_serial");
+    _lws_hint                = cl::NullRange;
+    std::string kernel_name  = is_data_type_quantized_asymmetric(dt) ? std::string("softmax_layer_max_shift_exp_sum_quantized_serial") :
+                               std::string("softmax_layer_max_shift_exp_sum_serial");
     ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
     unsigned int vector_size = std::get<1>(parallel_reduction_info);
 
@@ -498,7 +262,7 @@ void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor
     // Configure parallel kernel if needed
     if(std::get<0>(parallel_reduction_info))
     {
-        kernel_name = std::string("softmax_layer_max_shift_exp_sum_parallel");
+        kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("softmax_layer_max_shift_exp_sum_quantized_parallel") : std::string("softmax_layer_max_shift_exp_sum_parallel");
         bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
         build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));