From eb65f6da695ac0d3e495817145cceb1c4de4f048 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Wed, 15 Apr 2020 11:42:15 +0100 Subject: COMPMID-3304: Update OpenCL GEMM heuristic for Int8 Change-Id: I6b7ff678d8d0437a1639db2ff602ea1cdb155464 Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3056 Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas Comments-Addressed: Arm Jenkins --- .../CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h | 101 --------------------- .../kernels/CLGEMMLowpOffsetContributionKernel.h | 8 +- ...CLGEMMLowpOffsetContributionOutputStageKernel.h | 8 +- ...CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h | 2 +- .../CLGEMMLowpQuantizeDownInt32ScaleKernel.h | 2 +- ...antizeDownInt32ToInt16ScaleByFixedPointKernel.h | 2 +- ...uantizeDownInt32ToInt8ScaleByFixedPointKernel.h | 2 +- ...antizeDownInt32ToUint8ScaleByFixedPointKernel.h | 2 +- 8 files changed, 13 insertions(+), 114 deletions(-) delete mode 100644 arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h (limited to 'arm_compute/core/CL/kernels') diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h deleted file mode 100644 index e926f5ed36..0000000000 --- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to multiply matrices - * - * @note This kernel should be used ONLY for Midgard architectures - * - * This kernel performs the following computation: - * - * -# Convert a values from int8 to int32 - * -# Convert b values from int8 to int32 - * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 - * - */ -class CLGEMMLowpMatrixMultiplyKernel : public ICLKernel -{ -public: - /** Default Constructor */ - CLGEMMLowpMatrixMultiplyKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyKernel(const CLGEMMLowpMatrixMultiplyKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyKernel &operator=(const CLGEMMLowpMatrixMultiplyKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyKernel(CLGEMMLowpMatrixMultiplyKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyKernel &operator=(CLGEMMLowpMatrixMultiplyKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @note This kernel should be used ONLY for Midgard architectures - * - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8 - * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0 - * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of the input matrices - */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo()); - /** Initialise the kernel's input and output. - * - * @note This kernel should be used ONLY for Midgard architectures - * - * @param[in] compile_context The compile context to be used. - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8 - * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0 - * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of the input matrices - */ - void configure(CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyKernel - * - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8 - * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0 - * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of the input matrices - * - * @return a status - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo()); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input0; - const ICLTensor *_input1; - ICLTensor *_output; - bool _slide_matrix_b; - bool _reinterpret_input_as_3d; - bool _reinterpret_output_as_3d; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H*/ diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h index d7266b2805..f9ec558d85 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h @@ -30,9 +30,9 @@ namespace arm_compute { class ICLTensor; -/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place +/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place * - * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), * and adds to it the offset contribution of matrix A and matrix B in-place. * * The final result is: @@ -58,7 +58,7 @@ public: CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default; /** Initialise the kernel's input and output. * - * @param[in, out] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. @@ -73,7 +73,7 @@ public: /** Initialise the kernel's input and output. * * @param[in] compile_context The compile context to be used. - * @param[in, out] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h index 02ed20e5af..032539b699 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h @@ -30,9 +30,9 @@ namespace arm_compute { class ICLTensor; -/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and perform the output stage. +/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage. * - * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution * of matrix A and matrix B and performs the output stage defined by the output_stage argument * * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo. @@ -52,7 +52,7 @@ public: CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default; /** Initialise the kernel's input and output. * - * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. @@ -74,7 +74,7 @@ public: /** Initialise the kernel's input and output. * * @param[in] compile_context The compile context to be used. - * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h index 0b5b22cafc..dd85d8a97c 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h @@ -33,7 +33,7 @@ class ICLTensor; /** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED * - * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. * The following computations will be performed by the kernel: * * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h index 0d7d1c3390..f36076dfa2 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h @@ -32,7 +32,7 @@ class ICLTensor; /** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED * - * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. * The following computations will be performed by the kernel: * * -# Add offset terms to final result diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h index 2845d9259e..36cd7bf693 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h @@ -32,7 +32,7 @@ class ICLTensor; /** CL kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16 * - * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value. + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QSYMM16 value. * The following computations will be performed by the kernel: * * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h index a768b6fba0..fd95e00d5d 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h @@ -32,7 +32,7 @@ class ICLTensor; /** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED * - * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value. + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8_SIGNED value. * The following computations will be performed by the kernel: * * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h index e319c32c78..1714a02f76 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h @@ -32,7 +32,7 @@ class ICLTensor; /** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 * - * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8 value. * The following computations will be performed by the kernel: * * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier -- cgit v1.2.1