diff options
author | Michele Di Giorgio <michele.digiorgio@arm.com> | 2020-10-12 13:27:57 +0100 |
---|---|---|
committer | Michele Di Giorgio <michele.digiorgio@arm.com> | 2020-10-14 16:21:18 +0000 |
commit | ba14c92054ec9d2b5827fa85f85733e5cf496bcf (patch) | |
tree | 4ef8514285326e3027d4657b29251fe58e076a22 /src | |
parent | cf9e29e3bd2fcd772c156c7866425335bfdbde6a (diff) | |
download | ComputeLibrary-ba14c92054ec9d2b5827fa85f85733e5cf496bcf.tar.gz |
COMPMID-3829: Create CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel and remove padding from related OpenCL kernels
Change-Id: I0b0be8fcccf511c7214e83ba6aa8d0e901bc4f3c
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4146
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/core/CL/cl_kernels/gemmlowp.cl | 24 | ||||
-rw-r--r-- | src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp (renamed from src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp) | 103 | ||||
-rw-r--r-- | src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp | 182 | ||||
-rw-r--r-- | src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp | 183 | ||||
-rw-r--r-- | src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp | 101 |
5 files changed, 94 insertions, 499 deletions
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl index b4ac00535e..8405a7beb7 100644 --- a/src/core/CL/cl_kernels/gemmlowp.cl +++ b/src/core/CL/cl_kernels/gemmlowp.cl @@ -1986,6 +1986,7 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. * These values can be used to implement "rectified linear unit" activation functions + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -2015,7 +2016,7 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO TENSOR3D_DECLARATION(dst)) { // Compute source and destination addresses - int x = get_global_id(0) * 4; + int x = max((int)(get_global_id(0) * 4 - (4 - VEC_SIZE_LEFTOVER) % 4), 0); int y = get_global_id(1); int z = get_global_id(2); @@ -2044,17 +2045,17 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO input_values += (int4)RESULT_OFFSET_AFTER_SHIFT; VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) - res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); #if defined(MIN_BOUND) - res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); + res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); #endif // defined(MIN_BOUND) #if defined(MAX_BOUND) - res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); + res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); #endif // defined(MAX_BOUND) // Store the result - vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); + STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, 4, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) @@ -2077,6 +2078,7 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. * These values can be used to implement "rectified linear unit" activation functions + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -2106,13 +2108,13 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE TENSOR3D_DECLARATION(dst)) { // Compute source and destination addresses - int x = get_global_id(0) * 4; + int x = max((int)(get_global_id(0) * 4 - (4 - VEC_SIZE_LEFTOVER) % 4), 0); int y = get_global_id(1); int z = get_global_id(2); __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * 2 + y * dst_stride_y + z * dst_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(short) + y * dst_stride_y + z * dst_stride_z; int4 input_values = vload4(0, (__global int *)src_addr); @@ -2131,17 +2133,17 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); #endif // RESULT_SHIFT < 0 - short4 res = convert_short4_sat(input_values); + short4 res0 = convert_short4_sat(input_values); #if defined(MIN_BOUND) - res = max(res, (short4)MIN_BOUND); + res0 = max(res0, (short4)MIN_BOUND); #endif // defined(MIN_BOUND) #if defined(MAX_BOUND) - res = min(res, (short4)MAX_BOUND); + res0 = min(res0, (short4)MAX_BOUND); #endif // defined(MAX_BOUND) // Store the result - vstore4(res, 0, (__global short *)dst_addr); + STORE_VECTOR_SELECT(res, short, dst_addr, 4, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp index 92335747be..ff4136c5f0 100644 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,9 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" @@ -33,6 +32,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "support/StringSupport.h" @@ -40,11 +40,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info) { + ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(min > max); // Check biases if exist if(bias != nullptr) @@ -56,103 +55,69 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con if(output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); } return Status{}; } - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output) -{ - constexpr unsigned int num_elems_processed_per_iteration = 4; - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8)); - - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access); - - if(output->total_size() != 0) - { - Window win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win_out, output_result_access); - output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - } - - if(bias != nullptr) - { - AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]); - window_changed = window_changed || update_window_and_padding(win, bias_access); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } // namespace -CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() +CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel() : _input(nullptr), _bias(nullptr), _output(nullptr) { } -Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) +Status CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, + const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (bias != nullptr) ? bias->clone().get() : nullptr, - output->clone().get()) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info)); return Status{}; } -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) +void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, + const GEMMLowpOutputStageInfo *info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max)); - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info)); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type)); _input = input; _bias = bias; _output = output; + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0)); + // Set the arguments to pass at compile time + auto min = info->gemmlowp_min_bound; + auto max = info->gemmlowp_max_bound; CLBuildOptions build_opts; - build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(result_offset_after_shift)); - build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(result_fixedpoint_multiplier)); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(result_shift)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset)); + build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier)); + build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift)); build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); - build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max)); + build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), + "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), + "-DMAX_BOUND=" + support::cpp11::to_string(max)); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_fixedpoint", build_opts.options()); + const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - ICLKernel::configure_internal(win_config.second); + // Configure kernel window + auto win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); } -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue) +void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp deleted file mode 100644 index c98f5bf3eb..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(min > max); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, input); - } - - return Status{}; -} - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output) -{ - constexpr unsigned int num_elems_processed_per_iteration = 4; - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QSYMM16)); - - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access); - - if(output->total_size() != 0) - { - Window win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win_out, output_result_access); - - output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - } - - if(bias != nullptr) - { - AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]); - window_changed = window_changed || update_window_and_padding(win, bias_access); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr) -{ -} - -Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (bias != nullptr) ? bias->clone().get() : nullptr, - output->clone().get()) - .first); - - return Status{}; -} - -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, - int min, int max) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), - min, max)); - - _input = input; - _bias = bias; - _output = output; - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(result_fixedpoint_multiplier)); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(result_shift)); - build_opts.add_option_if((min > -32768), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < 32767), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", build_opts.options()); - - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); -} - -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - // Create input window - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Setup bias slice - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(_bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, _bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx1, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp deleted file mode 100644 index fa78410440..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(min > max); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - } - - return Status{}; -} - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output) -{ - constexpr unsigned int num_elems_processed_per_iteration = 4; - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8_SIGNED)); - - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access); - - if(output->total_size() != 0) - { - Window win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win_out, output_result_access); - output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - } - - if(bias != nullptr) - { - AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]); - window_changed = window_changed || update_window_and_padding(win, bias_access); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr) -{ -} - -Status CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (bias != nullptr) ? bias->clone().get() : nullptr, - output->clone().get()) - .first); - - return Status{}; -} - -void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max)); - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - - _input = input; - _bias = bias; - _output = output; - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(result_offset_after_shift)); - build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(result_fixedpoint_multiplier)); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(result_shift)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); - build_opts.add_option_if((min > -128), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < 127), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_fixedpoint", build_opts.options()); - - ICLKernel::configure_internal(win_config.second); -} - -void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - // Create input window - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Setup bias slice - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(_bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, _bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx1, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp index a499e1858d..28f397fd8b 100644 --- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp @@ -24,11 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" #include "support/MemorySupport.h" namespace arm_compute @@ -44,39 +42,59 @@ void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const CLComp int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min, int max) { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); + GEMMLowpOutputStageInfo info{}; + info.gemmlowp_multiplier = result_fixedpoint_multiplier; + info.gemmlowp_shift = result_shift; + info.gemmlowp_offset = result_offset_after_shift; + info.gemmlowp_min_bound = min; + info.gemmlowp_max_bound = max; + info.output_data_type = DataType::QASYMM8; + auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>(); + k->configure(compile_context, input, bias, output, &info); _kernel = std::move(k); } Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) { - return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max); + GEMMLowpOutputStageInfo info{}; + info.gemmlowp_min_bound = min; + info.gemmlowp_max_bound = max; + info.output_data_type = DataType::QASYMM8; + return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); } void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min, int max) { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - _kernel = std::move(k); + configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); } void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min, int max) { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); + GEMMLowpOutputStageInfo info{}; + info.gemmlowp_multiplier = result_fixedpoint_multiplier; + info.gemmlowp_shift = result_shift; + info.gemmlowp_offset = result_offset_after_shift; + info.gemmlowp_min_bound = min; + info.gemmlowp_max_bound = max; + info.output_data_type = DataType::QASYMM8_SIGNED; + auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>(); + k->configure(compile_context, input, bias, output, &info); _kernel = std::move(k); } Status CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) { - return CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, min, max); + GEMMLowpOutputStageInfo info{}; + info.gemmlowp_min_bound = min; + info.gemmlowp_max_bound = max; + info.output_data_type = DataType::QASYMM8_SIGNED; + return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); } void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, @@ -90,15 +108,25 @@ void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const CLComp int result_fixedpoint_multiplier, int result_shift, int min, int max) { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); + GEMMLowpOutputStageInfo info{}; + info.gemmlowp_multiplier = result_fixedpoint_multiplier; + info.gemmlowp_shift = result_shift; + info.gemmlowp_min_bound = min; + info.gemmlowp_max_bound = max; + info.output_data_type = DataType::QSYMM16; + auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>(); + k->configure(compile_context, input, bias, output, &info); _kernel = std::move(k); } Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) { - return CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, min, max); + GEMMLowpOutputStageInfo info{}; + info.gemmlowp_min_bound = min; + info.gemmlowp_max_bound = max; + info.output_data_type = DataType::QSYMM16; + return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); } void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) @@ -114,32 +142,9 @@ void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, c { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { - switch(info.output_data_type) - { - case DataType::QASYMM8: - { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - case DataType::QASYMM8_SIGNED: - { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - case DataType::QSYMM16: - { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); - k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported output data type."); - } + auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>(); + k->configure(compile_context, input, bias, output, &info); + _kernel = std::move(k); break; } case GEMMLowpOutputStageType::QUANTIZE_DOWN: @@ -169,19 +174,7 @@ Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorIn switch(info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - { - switch(output->data_type()) - { - case DataType::QASYMM8: - return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - case DataType::QASYMM8_SIGNED: - return CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - case DataType::QSYMM16: - return CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); - } - } + return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); case GEMMLowpOutputStageType::QUANTIZE_DOWN: return CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info); case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: |