author      Giorgio Arena <giorgio.arena@arm.com>    2018-02-15 13:37:40 +0000
committer   Anthony Barbier <anthony.barbier@arm.com>    2018-11-02 16:47:18 +0000
commit      4402cb93dffbd038f0e442d2f424a6927e55bc92 (patch)
tree        9b23b4f1b03e08a4e17c6b11f506abe1953b45bc /src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
parent      a086a0a4ddf1bbe17d532cc30be981b51034311e (diff)
download    ComputeLibrary-4402cb93dffbd038f0e442d2f424a6927e55bc92.tar.gz
COMPMID-905 Optimize CLSoftmaxLayer for QASYMM8
Change-Id: I3512d67b8a72b17db1381842ca42780e39cc511c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/120605
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'src/core/CL/kernels/CLSoftmaxLayerKernel.cpp')
-rw-r--r--  src/core/CL/kernels/CLSoftmaxLayerKernel.cpp  258
1 file changed, 11 insertions(+), 247 deletions(-)
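This patch drops the standalone CLLogits1DMaxKernel and CLLogits1DShiftExpSumKernel paths in favour of the fused CLLogits1DMaxShiftExpSumKernel, and adds QASYMM8 variants of the fused kernel. As a rough illustration of the arithmetic the quantized variants implement, here is a minimal float reference sketch in C++; it is not the library's fixed-point OpenCL code, the function and parameter names are hypothetical, and (as the diff below shows via tmp_data_type = DataType::S32) the real kernels carry the intermediate exponentials in S32 rather than float:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Float reference for softmax over one QASYMM8 row. 'scale' mirrors
// input->info()->quantization_info().scale used in the diff below; the
// zero point cancels because softmax(x + c) == softmax(x).
std::vector<float> softmax_qasymm8_reference(const std::vector<uint8_t> &row, float scale, float beta)
{
    const int q_max = *std::max_element(row.begin(), row.end());
    std::vector<float> out(row.size());
    float sum = 0.f;
    for(size_t i = 0; i < row.size(); ++i)
    {
        // row[i] - q_max <= 0, so the exponent is non-positive and exp() cannot overflow
        out[i] = std::exp(beta * scale * (static_cast<int>(row[i]) - q_max));
        sum += out[i];
    }
    for(float &v : out)
    {
        v /= sum; // normalization pass (handled by the 1DNorm kernel in the library)
    }
    return out;
}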
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 04a7639a83..447d6eeafa 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -79,33 +79,14 @@ CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float
return build_opts;
}
-// Arguments Validation
-
-Status validate_arguments_1DMax(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- // Softmax across the x dimension
- TensorShape output_shape{ input->tensor_shape() };
- output_shape.set(0, 1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- }
-
- return Status{};
-}
-
-Status validate_arguments_1DShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
+Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max);
+
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());
// Checks performed when output is configured
@@ -141,33 +122,6 @@ Status validate_arguments_1DShiftExpSum(const ITensorInfo *input, const ITensorI
return Status{};
}
-Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
- }
-
- // Checks performed when sum is configured
- if(sum->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(max, sum);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(max, sum);
- }
-
- return Status{};
-}
-
Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::S32, DataType::F16, DataType::F32);
@@ -200,58 +154,6 @@ Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *su
// Window validation
-std::pair<Status, Window> validate_and_configure_window_1DMax(ITensorInfo *input, ITensorInfo *output)
-{
- TensorShape output_shape{ input->tensor_shape() };
- output_shape.set(0, 1);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
-
- // The kernel loops over all elements in steps of 16
- const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->dimension(0), 16);
- constexpr unsigned int num_elems_written_per_iteration = 1;
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-std::pair<Status, Window> validate_and_configure_window_1DShiftExpSum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
-{
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());
- const DataType tmp_data_type = is_quantized_asymmetric ? DataType::S32 : input->data_type();
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*sum, max->clone()->set_data_type(tmp_data_type).set_fixed_point_position(input->fixed_point_position()));
- auto_init_if_empty(*output, input->clone()->set_data_type(tmp_data_type));
-
- // The kernel loops over all elements in steps of 16
- const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->dimension(0), 16);
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal max_access(max, 0, 1);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal sum_access(sum, 0, 1);
-
- bool window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access);
-
- output_access.set_valid_region(win, input->valid_region());
- sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
std::pair<Status, Window> validate_and_configure_window_1DMaxShiftExpSum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
{
// Output auto initialization if not yet initialized
@@ -305,146 +207,6 @@ std::pair<Status, Window> validate_and_configure_window_1DNorm(ITensorInfo *inpu
} // namespace
-void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- TensorShape output_shape{ input->info()->tensor_shape() };
- output_shape.set(0, 1);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMax(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- const DataType data_type = input->info()->data_type();
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option_if(is_data_type_fixed_point(data_type),
- "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
- build_opts.add_option_if(data_type == DataType::F16, "-DUSE_F16");
- // Tell the kernel that the width is not a multiple of 16
- build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, "-DNON_MULTIPLE_OF_16");
-
- // Create kernel
- std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "softmax_layer_max_quantized" : "softmax_layer_max";
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
- // Set fixed arguments
- unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
- _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window_1DMax(input->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = "softmax_layer_";
- _config_id += lower_string(string_from_data_type(data_type));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-Status CLLogits1DMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMax(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DMax(input->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-CLLogits1DShiftExpSumKernel::CLLogits1DShiftExpSumKernel()
- : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
-{
-}
-
-void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);
-
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->info()->data_type());
- const DataType tmp_data_type = is_quantized_asymmetric ? DataType::S32 : input->info()->data_type();
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*sum->info(), max->info()->clone()->set_data_type(tmp_data_type).set_fixed_point_position(input->info()->fixed_point_position()));
- auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(tmp_data_type));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DShiftExpSum(input->info(), max->info(), output->info(), sum->info()));
-
- _input = input;
- _max = max;
- _output = output;
- _sum = sum;
-
- const DataType dt = input->info()->data_type();
- auto beta_int = static_cast<int>(lround(beta * (1 << input->info()->fixed_point_position())));
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
- build_opts.add_option_if(is_data_type_fixed_point(dt),
- std::string("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
- build_opts.add_option_if(dt == DataType::F16, std::string("-DUSE_F16"));
- // Tell the kernel that the width is not a multiple of 16
- build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, std::string("-DNON_MULTIPLE_OF_16"));
- build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), std::string("-DBETA=" + support::cpp11::to_string(beta_int)));
- build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), std::string("-DBETA=" + float_to_string_with_full_precision(beta)));
- build_opts.add_options_if(is_quantized_asymmetric,
- prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());
-
- // Create kernel
- std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_shift_exp_sum_quantized" : "softmax_layer_shift_exp_sum";
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
- // Set fixed arguments
- unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
- _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
-
- // Configure window
- auto win_config = validate_and_configure_window_1DShiftExpSum(input->info(), max->info(), output->info(), sum->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure(win_config.second);
-}
-
-Status CLLogits1DShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DShiftExpSum(input, max, output, sum));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DShiftExpSum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first);
-
- return Status{};
-}
-
-void CLLogits1DShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = window_collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- // Set inputs
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _max, slice);
- add_3D_tensor_argument(idx, _output, slice);
- add_3D_tensor_argument(idx, _sum, slice);
- enqueue(queue, *this, slice, _lws_hint);
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
-
/**< Grid size (obtained through auto-tuning) */
const unsigned int CLLogits1DMaxShiftExpSumKernel::_grid_size = 64;
/**< Vector size in the serial case (obtained through auto-tuning) */
@@ -485,9 +247,11 @@ void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor
build_opts.add_option_if(dt == DataType::F16, "-DUSE_F16");
build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), "-DBETA=" + support::cpp11::to_string(beta_int));
build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
+ build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());
- _lws_hint = cl::NullRange;
- std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_serial");
+ _lws_hint = cl::NullRange;
+ std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("softmax_layer_max_shift_exp_sum_quantized_serial") :
+ std::string("softmax_layer_max_shift_exp_sum_serial");
ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
unsigned int vector_size = std::get<1>(parallel_reduction_info);
@@ -498,7 +262,7 @@ void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor
// Configure parallel kernel if needed
if(std::get<0>(parallel_reduction_info))
{
- kernel_name = std::string("softmax_layer_max_shift_exp_sum_parallel");
+ kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("softmax_layer_max_shift_exp_sum_quantized_parallel") : std::string("softmax_layer_max_shift_exp_sum_parallel");
bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));
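For completeness, the host-side pattern for exercising the fused kernel is validate, then configure, then enqueue. A minimal sketch, assuming the public interface suggested by the hunk headers above (the exact configure signature, the beta parameter, and the header paths are inferred and may differ between releases; tensor allocation and initialization are omitted):

#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

// Hypothetical helper: validate-then-configure for the fused max/shift/exp/sum stage.
void run_fused_softmax_stage(CLTensor &input, CLTensor &max, CLTensor &output, CLTensor &sum, float beta)
{
    // Static validation mirrors validate_arguments_1DMaxShiftExpSum in the diff above
    ARM_COMPUTE_ERROR_THROW_ON(CLLogits1DMaxShiftExpSumKernel::validate(input.info(), max.info(), output.info(), sum.info()));

    CLLogits1DMaxShiftExpSumKernel kernel;
    kernel.configure(&input, &max, &output, &sum, beta);
    CLScheduler::get().enqueue(kernel);
}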