From 4a578b923ed000c67fe0bc1433f945aea634ca9c Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 25 Jun 2021 12:13:49 +0100 Subject: Port the ClGemmLowp kernels to the new API Ported kernels: - CLGEMMLowpMatrixMultiplyNativeKernel - CLGEMMLowpMatrixMultiplyReshapedKernel - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel - CLGEMMLowpOffsetContributionKernel - CLGEMMLowpOffsetContributionOutputStageKernel - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel - CLGEMMLowpQuantizeDownInt32ScaleKernel Signed-off-by: Georgios Pinitas Change-Id: I9d5a744d6a2dd2f2726fdfb291bad000b6970de2 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5870 Reviewed-by: Michele Di Giorgio Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- .../CLGEMMLowpMatrixMultiplyNativeKernel.cpp | 335 ------------ .../kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h | 108 ---- .../CLGEMMLowpMatrixMultiplyReshapedKernel.cpp | 298 ----------- .../CLGEMMLowpMatrixMultiplyReshapedKernel.h | 125 ----- ...GEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp | 574 --------------------- ...CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h | 155 ------ .../kernels/CLGEMMLowpOffsetContributionKernel.cpp | 215 -------- .../kernels/CLGEMMLowpOffsetContributionKernel.h | 116 ----- ...GEMMLowpOffsetContributionOutputStageKernel.cpp | 274 ---------- ...CLGEMMLowpOffsetContributionOutputStageKernel.h | 136 ----- ...owpQuantizeDownInt32ScaleByFixedPointKernel.cpp | 153 ------ ...MLowpQuantizeDownInt32ScaleByFixedPointKernel.h | 89 ---- ...GEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp | 160 ------ ...CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h | 101 ---- .../CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp | 158 ------ .../CLGEMMLowpQuantizeDownInt32ScaleKernel.h | 102 ---- src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp | 221 -------- src/core/CL/kernels/CLGEMMLowpReductionKernel.h | 176 ------- 18 files changed, 3496 deletions(-) delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h delete mode 100644 src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp delete mode 100644 src/core/CL/kernels/CLGEMMLowpReductionKernel.h (limited to 'src/core/CL/kernels') diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp deleted file mode 100644 index efba3d1d7a..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -using namespace misc::shape_calculator; - -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(input0->data_type() == DataType::QASYMM8) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); - - const int m = gemm_info.m(); - const int n = gemm_info.n(); - const int k = gemm_info.k(); - - ARM_COMPUTE_UNUSED(m); - ARM_COMPUTE_UNUSED(n); - ARM_COMPUTE_UNUSED(k); - - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast(k)); - ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast(n)); - ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast(k)); - if(gemm_info.reinterpret_input_as_3d()) - { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast(m)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast(m)); - } - - if(output->total_size() != 0) - { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - - Window win{}; - bool window_changed = false; - - // In case both input and output have to be reinterpreted as 3D tensors, - // force reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_output_as_3d = false; - } - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32)); - - TensorInfo tmp_info(*output); - - if(reinterpret_output_as_3d) - { - // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(output->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = rhs_info.n0; - num_elems_processed_per_iteration_y = lhs_info.m0; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - // RHS matrix still needs padding on the X - AccessWindowStatic input1_access(input1, 0, 0, - ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), - input1->dimension(1)); - - window_changed = update_window_and_padding(win, input1_access); // window used by the execute_window_loop - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(output->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -CLGEMMLowpMatrixMultiplyNativeKernel::CLGEMMLowpMatrixMultiplyNativeKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false) -{ - _type = CLKernelType::GEMM; -} - -void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info); -} - -void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info)); - - _input0 = input0; - _input1 = input1; - _output = output; - _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - - // We still need padding on the X dimension for the RHS matrix - auto padding_info = get_padding_info({ input0, output }); - - // In case both input and output have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions(); - _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, - // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1); - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = internal_m % lhs_info.m0; - const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; - - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. - // NOTE: This might have implications on heuristics and performance - const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1))); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n())); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type())); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - std::string kernel_name("gemmlowp_mm_native"); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; - _config_id += "_"; - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += support::cpp11::to_string(output->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k()); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.k0); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLGEMMLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), - input1->clone().get(), - output->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed) - .first); - - return Status{}; -} - -void CLGEMMLowpMatrixMultiplyNativeKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - if(_input1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; - const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); - const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input0, slice); - add_2D_tensor_argument(idx, _input1, slice_b); - add_2D_tensor_argument(idx, _output, slice); - _kernel.setArg(idx++, static_cast(_input0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(_input1->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(_output->info()->strides_in_bytes()[2])); - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h deleted file mode 100644 index 125f0c6948..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */ -class CLGEMMLowpMatrixMultiplyNativeKernel : public ICLKernel -{ -public: - /** Default Constructor */ - CLGEMMLowpMatrixMultiplyNativeKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyNativeKernel(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyNativeKernel &operator=(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyNativeKernel(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyNativeKernel &operator=(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0 - * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0 - * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyNativeKernel - * - * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor info for the RHS matrix. Data type supported: same as @p input0 - * @param[in] output Output tensor info. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @return a status - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input0; - const ICLTensor *_input1; - ICLTensor *_output; - bool _slide_matrix_b; - bool _reinterpret_input_as_3d; - bool _reinterpret_output_as_3d; - bool _use_dummy_work_items; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H*/ diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp deleted file mode 100644 index fd533fc310..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp +++ /dev/null @@ -1,298 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -using namespace misc::shape_calculator; - -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose); - ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); - - const int m = gemm_info.m(); - const int n = gemm_info.n(); - const int k = gemm_info.k(); - - TensorShape tensor_shape0{ input0->tensor_shape() }; - tensor_shape0.set(0, k); - tensor_shape0.set(1, m); - - TensorShape tensor_shape1{ input1->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0); - const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1); - - const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info)); - const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1); - - if(output->total_size() != 0) - { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32)); - - TensorInfo tmp_info(*output); - if(reinterpret_output_as_3d) - { - // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(output->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = rhs_info.n0; - num_elems_processed_per_iteration_y = lhs_info.m0; - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(output->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - return std::make_pair(Status{}, collapsed); -} -} // namespace - -CLGEMMLowpMatrixMultiplyReshapedKernel::CLGEMMLowpMatrixMultiplyReshapedKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false) -{ - _type = CLKernelType::GEMM; -} - -void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, lhs_info, rhs_info, gemm_info); -} - -void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info)); - - _input0 = input0; - _input1 = input1; - _output = output; - _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - _k = gemm_info.k(); - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions(); - _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0); - - auto padding_info = get_padding_info({ input0, input1, output }); - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), lhs_info, rhs_info, gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1); - - const unsigned int partial_store_m0 = internal_m % lhs_info.m0; - const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1))); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); - build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); - build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m())); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); - build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0)); - build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type())); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - - std::string kernel_name("gemmlowp_mm_reshaped_"); - kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; - kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt"; - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; - _config_id += "_"; - _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += support::cpp11::to_string(output->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k()); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.k0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.v0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.h0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.interleave); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.interleave); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLGEMMLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), - input1->clone().get(), - output->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed) - .first); - - return Status{}; -} - -void CLGEMMLowpMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - if(_input1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4; - const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input0, slice); - add_2D_tensor_argument(idx, _input1, slice_b); - add_2D_tensor_argument(idx, _output, slice); - _kernel.setArg(idx++, static_cast(_k)); - _kernel.setArg(idx++, static_cast(_input0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(_input1->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(_output->info()->strides_in_bytes()[2])); - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h deleted file mode 100644 index 06a73f173d..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have been reshaped - * - * @note The input matrices @p input0 and @p input1 must be reshaped through: - * - @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel - * - @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel - */ -class CLGEMMLowpMatrixMultiplyReshapedKernel : public ICLKernel -{ -public: - /** Default Constructor */ - CLGEMMLowpMatrixMultiplyReshapedKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyReshapedKernel(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyReshapedKernel(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4. - * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3. - * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * lhs_info.transpose: false - * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @note lhs_info.k0 must be equal to rhs_info.k0 - */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4. - * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3. - * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * lhs_info.transpose: false - * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @note lhs_info.k0 must be equal to rhs_info.k0 - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedKernel - * - * @param[in] input0 Input tensor info containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4. - * @param[in] input1 Input tensor info containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3. - * @param[in] output Output tensor info. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * lhs_info.transpose: false - * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: 2,3,4,8,16 - * rhs_info.transpose: true - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @note lhs_info.k0 must be equal to rhs_info.k0 - * - * @return a status - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input0; - const ICLTensor *_input1; - ICLTensor *_output; - bool _slide_matrix_b; - bool _reinterpret_output_as_3d; - unsigned int _k; - bool _use_dummy_work_items; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H*/ diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp deleted file mode 100644 index 3424b6d264..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp +++ /dev/null @@ -1,574 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include -#include -#include - -using namespace arm_compute::misc::shape_calculator; - -namespace arm_compute -{ -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(input0->data_type() == DataType::QASYMM8) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - - const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; - const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; - const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); - - const int m = gemm_info.m; - const int n = gemm_info.n; - const int k = gemm_info.k; - - TensorShape tensor_shape1{ input1->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast(k)); - if(gemm_info.reinterpret_input_as_3d) - { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast(m)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast(m)); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1); - - const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info); - if(output->total_size() != 0) - { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(expected_output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - if(output_stage.type == GEMMLowpOutputStageType::NONE) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output); - } - } - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != bias->dimension(0)); - } - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), - "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported"); - - // Checks performed if the output stage needs to be fused - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - // If a_offset == 0, vector_sum_col can be a nullptr - if(gemm_info.a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_output_shape[0]); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(gemm_info.b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if mm result is a 3D reinterpretation - const bool reinterpret_as_3d = expected_output_shape.num_dimensions() > 1 && expected_output_shape.y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_output_shape[1] * expected_output_shape[2])); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_output_shape[1]); - - if(expected_output_shape.num_dimensions() > 1) - { - const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - TensorShape collapsed_output_shape(expected_output_shape); - collapsed_output_shape.collapse_from(output_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_output_shape[output_batch_idx], - "vector_sum_row must have the same number of batches of output tensor"); - - if(gemm_info.a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type()); - } - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - - if(output_multipliers != nullptr && output_shifts != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) - { - ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_shifts->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != output_multipliers->dimension(0)); - } - } - } - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed) -{ - const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0); - - Window win{}; - Window win_out{}; - bool window_changed = false; - - // In case both input and output have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_output_as_3d = false; - } - - // Output tensor auto initialization if not yet initialized - const TensorShape expected_output_shape = compute_mm_shape(*input0, *input1, gemm_info); - if(output_stage.type != GEMMLowpOutputStageType::NONE) - { - auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(output_stage.output_data_type)); - } - else - { - auto_init_if_empty(*output, input0->clone()->set_tensor_shape(expected_output_shape).set_data_type(DataType::S32)); - } - - TensorInfo tmp_info(*output); - - if(reinterpret_output_as_3d) - { - // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(output->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0; - num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - if(gemm_info.a_offset != 0) - { - AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access); - } - // No access window needed for vector_sum_row - ARM_COMPUTE_UNUSED(vector_sum_row); - - if(bias != nullptr) - { - AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, bias_access); - } - - if(output_multipliers != nullptr && output_stage.is_quantized_per_channel) - { - AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x); - AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access); - } - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(output->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel() - : _input0(nullptr), - _input1(nullptr), - _output(nullptr), - _vector_sum_col(nullptr), - _vector_sum_row(nullptr), - _bias(nullptr), - _output_multipliers(nullptr), - _output_shifts(nullptr), - _slide_matrix_b(true), - _reinterpret_input_as_3d(false), - _reinterpret_output_as_3d(false), - _use_dummy_work_items(false), - _is_quantized_per_channel(false), - _fuse_output_stage(false) -{ - _type = CLKernelType::GEMM; -} - -void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, - const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) -{ - configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts); -} - -void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, - const GEMMKernelInfo &gemm_info, - const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), - input1->info(), - output->info(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, - bias != nullptr ? bias->info() : nullptr, - output_multipliers != nullptr ? output_multipliers->info() : nullptr, - output_shifts != nullptr ? output_shifts->info() : nullptr)); - - auto padding_info = get_padding_info({ input0, input1, output, vector_sum_row }); - const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; - const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; - const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - const int32_t a_offset = gemm_info.a_offset; - const int32_t b_offset = gemm_info.b_offset; - - _input0 = input0; - _input1 = input1; - _output = output; - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _bias = bias; - _output_multipliers = output_multipliers; - _output_shifts = output_shifts; - _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0); - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - _is_quantized_per_channel = output_stage.is_quantized_per_channel; - - // In case both input and output have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions(); - _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), - input1->info(), - output->info(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, - bias != nullptr ? bias->info() : nullptr, - output_multipliers != nullptr ? output_multipliers->info() : nullptr, - output_shifts != nullptr ? output_shifts->info() : nullptr, - num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, - // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1); - - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. - // NOTE: This might have implications on heuristics and performance - const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = internal_m % internal_m0; - const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); - build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); - build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type())); - - std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_"); - kernel_name += rhs_info.transpose ? "t" : "nt"; - - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - kernel_name += "_fused_output_stage_fixedpoint"; - _fuse_output_stage = true; - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0 && vector_sum_col != nullptr) - { - build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); - } - // If b_offset == 0, vector_sum_row can be a nullptr - build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); - build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * input0->info()->dimension(0))); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); - build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); - build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); - - const int min = output_stage.gemmlowp_min_bound; - const int max = output_stage.gemmlowp_max_bound; - - PixelValue min_val{}; - PixelValue max_val{}; - std::tie(min_val, max_val) = get_min_max(output->info()->data_type()); - build_opts.add_option_if(min != min_val.get(), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if(max != max_val.get(), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - } - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; - _config_id += "_"; - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += support::cpp11::to_string(output->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.k0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.h0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.interleave); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), - input1->clone().get(), - output->clone().get(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - bias != nullptr ? bias->clone().get() : nullptr, - output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, - output_shifts != nullptr ? output_shifts->clone().get() : nullptr, - num_elements_processed) - .first); - - return Status{}; -} - -void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - if(_input1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; - const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); - const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - // Set window for vector_sum_col - Window win_vector_sum_col = slice; - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Set window for vector_sum_row - Window win_vector_sum_row = slice; - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window biases_slice = slice; - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input0, slice); - add_2D_tensor_argument(idx, _input1, slice_b); - add_2D_tensor_argument(idx, _output, slice); - _kernel.setArg(idx++, static_cast(_input0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(_input1->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(_output->info()->strides_in_bytes()[2])); - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - idx++; - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - idx++; - } - - if(_fuse_output_stage) - { - add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col); - add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row); - add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice); - } - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h deleted file mode 100644 index e79f6dfe05..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (input1) has been reshaped - * - * @note The input matrix input1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel - * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported - */ -class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel -{ -public: - /** Default Constructor */ - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32. - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info. - * Only the following values are supported for LHS info: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * Only the following values are supported for RHS info: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 - * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32 - * @param[in] bias (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32. - * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr, - const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0 - * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32. - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info. - * Only the following values are supported for LHS info: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * Only the following values are supported for RHS info: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 - * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32 - * @param[in] bias (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32. - * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr, - const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel - * - * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] input1 Input tensor info for the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[in] output Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/S32. - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info. - * Only the following values are supported for LHS info: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * Only the following values are supported for RHS info: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] vector_sum_col (Optional) Input row-vector info of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 - * @param[in] vector_sum_row (Optional) Input row-vector info of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32 - * @param[in] bias (Optional) Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32. - * @param[in] output_multipliers (Optional) Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - * @param[in] output_shifts (Optional) Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - * - * @return a status - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, const ITensorInfo *vector_sum_col = nullptr, - const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, const ITensorInfo *output_multipliers = nullptr, - const ITensorInfo *output_shifts = nullptr); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input0; - const ICLTensor *_input1; - ICLTensor *_output; - const ICLTensor *_vector_sum_col; - const ICLTensor *_vector_sum_row; - const ICLTensor *_bias; - const ICLTensor *_output_multipliers; - const ICLTensor *_output_shifts; - bool _slide_matrix_b; - bool _reinterpret_input_as_3d; - bool _reinterpret_output_as_3d; - bool _use_dummy_work_items; - bool _is_quantized_per_channel; - bool _fuse_output_stage; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H */ \ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp deleted file mode 100644 index b97bb84e59..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); - } - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); - - TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) - { - const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - output_shape.collapse_from(output_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], - "mm_result tensor must have the same number of batches of output tensor"); - - if(a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - return Status{}; -} -} // namespace - -CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel() - : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _bias(nullptr) -{ - _type = CLKernelType::ELEMENTWISE; -} - -void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, - int32_t b_offset) -{ - configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, k, a_offset, b_offset); -} - -void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, - const ICLTensor *bias, - int32_t k, int32_t a_offset, - int32_t b_offset) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, - bias != nullptr ? bias->info() : nullptr, - a_offset, b_offset)); // NOLINT - - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias }); - - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _mm_result = mm_result; - _bias = bias; - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0)); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration)); - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); - } - // If b_offset == 0, vector_sum_row can be a nullptr - build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); - build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1))); - build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2))); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - std::string kernel_name("gemmlowp_offset_contribution"); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name + "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(2)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); - return Status{}; -} - -void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Set window for vector_sum_col - Window win_vector_sum_col = slice; - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Set window for vector_sum_row - Window win_vector_sum_row = slice; - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window biases_slice = slice; - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _mm_result, slice); - add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col); - add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row); - add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h deleted file mode 100644 index f8705595a0..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), - * and adds to it the offset contribution of matrix A and matrix B in-place. - * - * The final result is: - * - * mm_result[i][k] = mm_result[i][k] + - * (vector_sum_col[k] * a_offset) + - * (vector_sum_row[i] * b_offset) + - * (a_offset * b_offset * k) - * - */ -class CLGEMMLowpOffsetContributionKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpOffsetContributionKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpOffsetContributionKernel(const CLGEMMLowpOffsetContributionKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpOffsetContributionKernel &operator=(const CLGEMMLowpOffsetContributionKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpOffsetContributionKernel(CLGEMMLowpOffsetContributionKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - */ - void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, int32_t b_offset); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - */ - void configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, - int32_t b_offset); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel - * - * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpOffsetContributionKernel. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * - * @return a status - */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_vector_sum_col; - const ICLTensor *_vector_sum_row; - ICLTensor *_mm_result; - const ICLTensor *_bias; -}; -} // namespace arm_compute - -#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp deleted file mode 100644 index eaa832fbaa..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, - int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); - } - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) - { - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0)); - } - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); - - TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) - { - const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - output_shape.collapse_from(output_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], - "mm_result tensor must have the same number of batches of output tensor"); - - if(a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE); - // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != output->data_type()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output); - } - - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect"); - - return Status{}; -} -} // namespace - -CLGEMMLowpOffsetContributionOutputStageKernel::CLGEMMLowpOffsetContributionOutputStageKernel() - : _mm_result(nullptr), - _vector_sum_col(nullptr), - _vector_sum_row(nullptr), - _bias(nullptr), - _output(nullptr), - _output_multipliers(nullptr), - _output_shifts(nullptr), - _is_quantized_per_channel(false) -{ - _type = CLKernelType::ELEMENTWISE; -} - -void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) -{ - configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, output, k, a_offset, b_offset, output_stage, output_multipliers, output_shifts); -} - -void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, - const ICLTensor *bias, ICLTensor *output, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ICLTensor *output_multipliers, const ICLTensor *output_shifts) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output, output_multipliers, output_shifts); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(), - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, - bias != nullptr ? bias->info() : nullptr, - output->info(), - a_offset, b_offset, output_stage, - output_multipliers->info(), output_shifts->info())); // NOLINT - - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, output, output_multipliers, output_shifts }); - - const int min = output_stage.gemmlowp_min_bound; - const int max = output_stage.gemmlowp_max_bound; - - _vector_sum_col = vector_sum_col; - _vector_sum_row = vector_sum_row; - _mm_result = mm_result; - _bias = bias; - _output = output; - _output_multipliers = output_multipliers; - _output_shifts = output_shifts; - _is_quantized_per_channel = output_stage.is_quantized_per_channel; - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - - // Auto initialize the output - auto_init_if_empty(*output->info(), mm_result->info()->clone()->set_data_type(output_stage.output_data_type)); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0)); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration)); - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); - } - // If b_offset == 0, vector_sum_row can be a nullptr - build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); - build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1))); - build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2))); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); - build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); - build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); - - PixelValue min_val{}; - PixelValue max_val{}; - std::tie(min_val, max_val) = get_min_max(output->info()->data_type()); - build_opts.add_option_if((min > min_val.get()), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < max_val.get()), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - - std::string kernel_name("gemmlowp_offset_contribution"); - kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name + "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->info()->dimension(2)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); - return Status{}; -} - -void CLGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Set window for vector_sum_col - Window win_vector_sum_col = slice; - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Set window for vector_sum_row - Window win_vector_sum_row = slice; - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window biases_slice = slice; - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _mm_result, slice); - add_2D_tensor_argument_if((_vector_sum_col != nullptr), idx, _vector_sum_col, win_vector_sum_col); - add_2D_tensor_argument_if((_vector_sum_row != nullptr), idx, _vector_sum_row, win_vector_sum_row); - add_1D_tensor_argument_if((_bias != nullptr), idx, _bias, biases_slice); - add_3D_tensor_argument(idx, _output, slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_multipliers, biases_slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, _output_shifts, biases_slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h deleted file mode 100644 index 15f54d17a5..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage. - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution - * of matrix A and matrix B and performs the output stage defined by the output_stage argument - * - * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo. - */ -class CLGEMMLowpOffsetContributionOutputStageKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpOffsetContributionOutputStageKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpOffsetContributionOutputStageKernel(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpOffsetContributionOutputStageKernel &operator=(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpOffsetContributionOutputStageKernel(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info - * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - */ - void configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, int32_t k, int32_t a_offset, int32_t b_offset, - const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info - * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, - int32_t k, - int32_t a_offset, int32_t b_offset, - const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel - * - * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpOffsetContributionKernel. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info - * @param[in] output_multipliers Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - * @param[in] output_shifts Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - * - * @return a status - */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset, - int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_mm_result; - const ICLTensor *_vector_sum_col; - const ICLTensor *_vector_sum_row; - const ICLTensor *_bias; - ICLTensor *_output; - const ICLTensor *_output_multipliers; - const ICLTensor *_output_shifts; - bool _is_quantized_per_channel; -}; -} // namespace arm_compute - -#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp deleted file mode 100644 index 03bdf3f836..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - } - - return Status{}; -} -} // namespace - -CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr) -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info)); - - return Status{}; -} - -void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - const GEMMLowpOutputStageInfo *info) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info)); - - auto padding_info = get_padding_info({ input, bias, output }); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type)); - - _input = input; - _bias = bias; - _output = output; - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0)); - - // Set the arguments to pass at compile time - auto min = info->gemmlowp_min_bound; - auto max = info->gemmlowp_max_bound; - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset)); - build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier)); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - auto win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - // Create input window - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Setup bias slice - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(_bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, _bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx1, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h deleted file mode 100644 index 8653102cd8..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16 - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value. - * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by gemmlowp_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16. - */ -class CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16. - * @param[in] info Output stage info. Used to pass the quantized output data type - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: Data type supported: QSYMM8/QASYMM8_SIGNED/QSYMM16. - * @param[in] info Output stage info. Used to pass the quantized output data type - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - const ICLTensor *_bias; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp deleted file mode 100644 index abef484c1e..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) - || info->gemmlowp_min_bound > info->gemmlowp_max_bound); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - } - - return Status{}; -} -} // namespace - -class Coordinates; -CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr) -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info)); - - return Status{}; -} - -void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - const GEMMLowpOutputStageInfo *info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info); -} - -void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - const GEMMLowpOutputStageInfo *info) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info)); - - auto padding_info = get_padding_info({ input, bias, output }); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type)); - - _input = input; - _bias = bias; - _output = output; - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0)); - - auto min = info->gemmlowp_min_bound; - auto max = info->gemmlowp_max_bound; - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier)); - build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); - build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_float", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - // Create input window - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Setup bias slice - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(_bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, _bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx1, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h deleted file mode 100644 index 0a8d5e1942..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -// Forward declarations -class ICLTensor; - -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. - * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Requantize - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to - * - to the [0..255] range and cast to QASYMM8. - * - to the [-128..127] range and cast to QASYMM8_SIGNED. - */ -class CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] info Output stage info. Used to pass the quantized output data type - */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] info Output stage info. Used to pass the quantized output data type - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] info Output stage info. Used to pass the quantized output data type - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - const ICLTensor *_bias; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp deleted file mode 100644 index faf86c769e..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != output_stage->output_data_type, "Mismatching output data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - } - - return Status{}; -} -} //namespace - -CLGEMMLowpQuantizeDownInt32ScaleKernel::CLGEMMLowpQuantizeDownInt32ScaleKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr) -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, output_stage)); - - return Status{}; -} - -void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, output_stage); -} - -void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - const GEMMLowpOutputStageInfo *output_stage) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), - (bias != nullptr) ? bias->info() : nullptr, - output->info(), - output_stage)); - - auto padding_info = get_padding_info({ input, bias, output }); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_stage->output_data_type)); - - _input = input; - _bias = bias; - _output = output; - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0)); - - // Set the arguments to pass at compile time - auto min = output_stage->gemmlowp_min_bound; - auto max = output_stage->gemmlowp_max_bound; - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset)); - build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier)); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift)); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), - "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), - "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void CLGEMMLowpQuantizeDownInt32ScaleKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(_bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, _bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx1, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} \ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h deleted file mode 100644 index abdf33ea43..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. - * The following computations will be performed by the kernel: - * - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Add bias to final result if bias tensor is not a nullptr - * -# Shift the int32 accumulator by result_shift - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values: - * -# -to the [0..255] range and cast to QASYMM8. - * -# -to the [-128..127] range and cast to QASYMM8_SIGNED. - * - */ -class CLGEMMLowpQuantizeDownInt32ScaleKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpQuantizeDownInt32ScaleKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleKernel(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleKernel(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] output_stage GEMMLowp output stage metadata. - */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] output_stage GEMMLowp output stage metadata. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleKernel - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] output_stage GEMMLowp output stage metadata. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - const ICLTensor *_bias; - ICLTensor *_output; -}; -} // namespace arm_compute - -#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H */ \ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp deleted file mode 100644 index 55ae3ea6ca..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8); - - if(output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); - } - return Status{}; -} - -Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - - if(output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); - } - return Status{}; -} -} // namespace - -ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel() - : _input(), _output() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) -{ - configure(CLKernelLibrary::get().get_compile_context(), mtx_a, vector_sum_row, info); -} - -void CLGEMMLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info())); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*vector_sum_row->info(), TensorShape(mtx_a->info()->dimension(1)), 1, DataType::S32); - - auto padding_info = get_padding_info({ mtx_a, vector_sum_row }); - - _input = mtx_a; - _output = vector_sum_row; - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->info()->dimension(0))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_a->info()->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_a->info()->data_type())); - build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar)); - - const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); - - std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : ""); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - // This kernel does not need padding - Window win = calculate_max_window(*vector_sum_row->info(), Steps()); - ICLKernel::configure_internal(win); - - _config_id = kernel_name; - _config_id += "_"; - _config_id += support::cpp11::to_string(_input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(_input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(_input->info()->dimension(2)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); - - return Status{}; -} - -void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); - Window slice_in = collapsed.first_slice_window_2D(); - Window slice_out = collapsed.first_slice_window_2D(); - - // Setup input slice. Its dimensions are increased in the cl kernel. - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_in); - add_2D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); -} - -void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) -{ - configure(CLKernelLibrary::get().get_compile_context(), mtx_b, vector_sum_col, info); -} - -void CLGEMMLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info())); - - _input = mtx_b; - _output = vector_sum_col; - - // Output auto initialization if not yet initialized - auto_init_if_empty(*_output->info(), TensorShape(mtx_b->info()->dimension(0)), 1, DataType::S32); - - auto padding_info = get_padding_info({ mtx_b, vector_sum_col }); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->info()->dimension(0)); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->info()->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(0))); - build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(1))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->info()->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_b->info()->data_type())); - build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar)); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_matrix_b_reduction", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status CLGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); - - return Status{}; -} - -void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY); - - Window slice_out = collapsed.first_slice_window_2D(); - Window slice_in = slice_out; - - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_in); - add_2D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.h b/src/core/CL/kernels/CLGEMMLowpReductionKernel.h deleted file mode 100644 index 237d8099b7..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; -struct GEMMLowpReductionKernelInfo; - -/** Common interface for all OpenCL reduction kernels */ -class ICLGEMMLowpReductionKernel : public ICLKernel -{ -public: - /** Constructor */ - ICLGEMMLowpReductionKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - ICLGEMMLowpReductionKernel(const ICLGEMMLowpReductionKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - ICLGEMMLowpReductionKernel &operator=(const ICLGEMMLowpReductionKernel &) = delete; - /** Allow instances of this class to be moved */ - ICLGEMMLowpReductionKernel(ICLGEMMLowpReductionKernel &&) = default; - /** Allow instances of this class to be moved */ - ICLGEMMLowpReductionKernel &operator=(ICLGEMMLowpReductionKernel &&) = default; - - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - virtual void configure(const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0; - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - virtual void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0; - -protected: - const ICLTensor *_input; - ICLTensor *_output; -}; - -/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. - * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - */ -class CLGEMMLowpMatrixAReductionKernel : public ICLGEMMLowpReductionKernel -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override; - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override; - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixAReductionKernel - * - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - * - * @return a status - */ - static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; -}; - -/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. - * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - */ -class CLGEMMLowpMatrixBReductionKernel : public ICLGEMMLowpReductionKernel -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. - * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override; - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. - * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override; - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixBReductionKernel - * - * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. - * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - * - * @return a status - */ - static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H */ -- cgit v1.2.1