author      Giorgio Arena <giorgio.arena@arm.com>    2018-02-15 13:37:40 +0000
committer   Anthony Barbier <anthony.barbier@arm.com>    2018-11-02 16:47:18 +0000
commit      4402cb93dffbd038f0e442d2f424a6927e55bc92 (patch)
tree        9b23b4f1b03e08a4e17c6b11f506abe1953b45bc /src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
parent      a086a0a4ddf1bbe17d532cc30be981b51034311e (diff)
download    ComputeLibrary-4402cb93dffbd038f0e442d2f424a6927e55bc92.tar.gz
COMPMID-905 Optimize CLSoftmaxLayer for QASYMM8
Change-Id: I3512d67b8a72b17db1381842ca42780e39cc511c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/120605
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'src/core/CL/kernels/CLSoftmaxLayerKernel.cpp')
-rw-r--r--  src/core/CL/kernels/CLSoftmaxLayerKernel.cpp  258
1 file changed, 11 insertions(+), 247 deletions(-)
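This patch drops the standalone CLLogits1DMaxKernel and CLLogits1DShiftExpSumKernel paths in favour of the fused CLLogits1DMaxShiftExpSumKernel, and adds QASYMM8 variants of the fused kernel. As a rough illustration of the arithmetic the quantized variants implement, here is a minimal float reference sketch in C++; it is not the library's fixed-point OpenCL code, the function and parameter names are hypothetical, and (as the diff below shows via tmp_data_type = DataType::S32) the real kernels carry the intermediate exponentials in S32 rather than float:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Float reference for softmax over one QASYMM8 row. 'scale' mirrors
// input->info()->quantization_info().scale used in the diff below; the
// zero point cancels because softmax(x + c) == softmax(x).
std::vector<float> softmax_qasymm8_reference(const std::vector<uint8_t> &row, float scale, float beta)
{
    const int q_max = *std::max_element(row.begin(), row.end());
    std::vector<float> out(row.size());
    float sum = 0.f;
    for(size_t i = 0; i < row.size(); ++i)
    {
        // row[i] - q_max <= 0, so the exponent is non-positive and exp() cannot overflow
        out[i] = std::exp(beta * scale * (static_cast<int>(row[i]) - q_max));
        sum += out[i];
    }
    for(float &v : out)
    {
        v /= sum; // normalization pass (handled by the 1DNorm kernel in the library)
    }
    return out;
}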
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 04a7639a83..447d6eeafa 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -79,33 +79,14 @@ CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float
return build_opts;
}
-// Arguments Validation
-
-Status validate_arguments_1DMax(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- // Softmax across the x dimension
- TensorShape output_shape{ input->tensor_shape() };
- output_shape.set(0, 1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- }
-
- return Status{};
-}
-
-Status validate_arguments_1DShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
+Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max);
+
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());
// Checks performed when output is configured
@@ -141,33 +122,6 @@ Status validate_arguments_1DShiftExpSum(const ITensorInfo *input, const ITensorI
return Status{};
}
-Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
- }
-
- // Checks performed when sum is configured
- if(sum->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(max, sum);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(max, sum);
- }
-
- return Status{};
-}
-
Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::S32, DataType::F16, DataType::F32);
@@ -200,58 +154,6 @@ Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *su
// Window validation
-std::pair<Status, Window> validate_and_configure_window_1DMax(ITensorInfo *input, ITensorInfo *output)
-{
- TensorShape output_shape{ input->tensor_shape() };
- output_shape.set(0, 1);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
-
- // The kernel loops over all elements in steps of 16
- const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->dimension(0), 16);
- constexpr unsigned int num_elems_written_per_iteration = 1;
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-std::pair<Status, Window> validate_and_configure_window_1DShiftExpSum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
-{
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());
- const DataType tmp_data_type = is_quantized_asymmetric ? DataType::S32 : input->data_type();
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*sum, max->clone()->set_data_type(tmp_data_type).set_fixed_point_position(input->fixed_point_position()));
- auto_init_if_empty(*output, input->clone()->set_data_type(tmp_data_type));
-
- // The kernel loops over all elements in steps of 16
- const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->dimension(0), 16);
-
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal max_access(max, 0, 1);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal sum_access(sum, 0, 1);
-
- bool window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access);
-
- output_access.set_valid_region(win, input->valid_region());
- sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape()));
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
std::pair<Status, Window> validate_and_configure_window_1DMaxShiftExpSum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
{
// Output auto initialization if not yet initialized
@@ -305,146 +207,6 @@ std::pair<Status, Window> validate_and_configure_window_1DNorm(ITensorInfo *inpu
} // namespace
-void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- TensorShape output_shape{ input->info()->tensor_shape() };
- output_shape.set(0, 1);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMax(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- const DataType data_type = input->info()->data_type();
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option_if(is_data_type_fixed_point(data_type),
- "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
- build_opts.add_option_if(data_type == DataType::F16, "-DUSE_F16");
- // Tell the kernel that the width is not a multiple of 16
- build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, "-DNON_MULTIPLE_OF_16");
-
- // Create kernel
- std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "softmax_layer_max_quantized" : "softmax_layer_max";
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
- // Set fixed arguments
- unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
- _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window_1DMax(input->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure(win_config.second);
-
- // Set config_id for enabling LWS tuning
- _config_id = "softmax_layer_";
- _config_id += lower_string(string_from_data_type(data_type));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(0));
- _config_id += "_";
- _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-Status CLLogits1DMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMax(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DMax(input->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-CLLogits1DShiftExpSumKernel::CLLogits1DShiftExpSumKernel()
- : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
-{
-}
-
-void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);
-
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->info()->data_type());
- const DataType tmp_data_type = is_quantized_asymmetric ? DataType::S32 : input->info()->data_type();
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*sum->info(), max->info()->clone()->set_data_type(tmp_data_type).set_fixed_point_position(input->info()->fixed_point_position()));
- auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(tmp_data_type));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DShiftExpSum(input->info(), max->info(), output->info(), sum->info()));
-
- _input = input;
- _max = max;
- _output = output;
- _sum = sum;
-
- const DataType dt = input->info()->data_type();
- auto beta_int = static_cast<int>(lround(beta * (1 << input->info()->fixed_point_position())));
-
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
- build_opts.add_option_if(is_data_type_fixed_point(dt),
- std::string("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
- build_opts.add_option_if(dt == DataType::F16, std::string("-DUSE_F16"));
- // Tell the kernel that the width is not a multiple of 16
- build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, std::string("-DNON_MULTIPLE_OF_16"));
- build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), std::string("-DBETA=" + support::cpp11::to_string(beta_int)));
- build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), std::string("-DBETA=" + float_to_string_with_full_precision(beta)));
- build_opts.add_options_if(is_quantized_asymmetric,
- prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());
-
- // Create kernel
- std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_shift_exp_sum_quantized" : "softmax_layer_shift_exp_sum";
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
- // Set fixed arguments
- unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
- _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
-
- // Configure window
- auto win_config = validate_and_configure_window_1DShiftExpSum(input->info(), max->info(), output->info(), sum->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure(win_config.second);
-}
-
-Status CLLogits1DShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DShiftExpSum(input, max, output, sum));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DShiftExpSum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first);
-
- return Status{};
-}
-
-void CLLogits1DShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = window_collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- // Set inputs
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _max, slice);
- add_3D_tensor_argument(idx, _output, slice);
- add_3D_tensor_argument(idx, _sum, slice);
- enqueue(queue, *this, slice, _lws_hint);
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
-
/**< Grid size (obtained through auto-tuning) */
const unsigned int CLLogits1DMaxShiftExpSumKernel::_grid_size = 64;
/**< Vector size in the serial case (obtained through auto-tuning) */
@@ -485,9 +247,11 @@ void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor
build_opts.add_option_if(dt == DataType::F16, "-DUSE_F16");
build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), "-DBETA=" + support::cpp11::to_string(beta_int));
build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
+ build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());
- _lws_hint = cl::NullRange;
- std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_serial");
+ _lws_hint = cl::NullRange;
+ std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("softmax_layer_max_shift_exp_sum_quantized_serial") :
+ std::string("softmax_layer_max_shift_exp_sum_serial");
ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
unsigned int vector_size = std::get<1>(parallel_reduction_info);
@@ -498,7 +262,7 @@ void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor
// Configure parallel kernel if needed
if(std::get<0>(parallel_reduction_info))
{
- kernel_name = std::string("softmax_layer_max_shift_exp_sum_parallel");
+ kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("softmax_layer_max_shift_exp_sum_quantized_parallel") : std::string("softmax_layer_max_shift_exp_sum_parallel");
bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));
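For completeness, the host-side pattern for exercising the fused kernel is validate, then configure, then enqueue. A minimal sketch, assuming the public interface suggested by the hunk headers above (the exact configure signature, the beta parameter, and the header paths are inferred and may differ between releases; tensor allocation and initialization are omitted):

#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

// Hypothetical helper: validate-then-configure for the fused max/shift/exp/sum stage.
void run_fused_softmax_stage(CLTensor &input, CLTensor &max, CLTensor &output, CLTensor &sum, float beta)
{
    // Static validation mirrors validate_arguments_1DMaxShiftExpSum in the diff above
    ARM_COMPUTE_ERROR_THROW_ON(CLLogits1DMaxShiftExpSumKernel::validate(input.info(), max.info(), output.info(), sum.info()));

    CLLogits1DMaxShiftExpSumKernel kernel;
    kernel.configure(&input, &max, &output, &sum, beta);
    CLScheduler::get().enqueue(kernel);
}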