From 05288a2b871ef99f544771621c3bba409b2f70df Mon Sep 17 00:00:00 2001 From: Gian Marco Date: Tue, 21 Nov 2017 10:57:50 +0000 Subject: COMPMID-697 - Rework GEMMLowp interface on OpenCL Reworked the interface of GemmLowp in order to make easy the integration in Android NN - Added support for different output stage - Added validation for both matrix multiplication and output stage - Added bounded relu support in the output stage - Added in32_t bias support - Added optimized path for vector by matrix case This rework is required for: - Convolution quantized - Fully connected quantized Change-Id: I512283d406099cf8c614dd89d0a97ed411143afc Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110625 Reviewed-by: Georgios Pinitas Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com --- arm_compute/core/CL/CLKernels.h | 3 + .../CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h | 31 +- .../kernels/CLGEMMLowpOffsetContributionKernel.h | 82 ++++ ...CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h | 84 ++++ .../core/CL/kernels/CLGEMMLowpReductionKernel.h | 99 ++++ .../kernels/NEGEMMLowpOffsetContributionKernel.h | 4 +- ...NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h | 2 +- .../core/NEON/kernels/NEGEMMLowpReductionKernel.h | 2 +- arm_compute/runtime/CL/CLFunctions.h | 3 +- arm_compute/runtime/CL/functions/CLGEMMLowp.h | 89 ---- .../CL/functions/CLGEMMLowpMatrixMultiplyCore.h | 91 ++++ .../runtime/CL/functions/CLGEMMLowpOutputStage.h | 78 +++ docs/00_introduction.dox | 2 +- src/core/CL/CLKernelLibrary.cpp | 11 +- src/core/CL/cl_kernels/gemm.cl | 104 ---- src/core/CL/cl_kernels/gemmlowp.cl | 540 +++++++++++++++++++++ .../CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp | 99 ++-- .../kernels/CLGEMMLowpOffsetContributionKernel.cpp | 162 +++++++ ...GEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp | 128 +++++ src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp | 162 +++++++ .../NEON/kernels/NEGEMMLowpReductionKernel.cpp | 2 +- src/runtime/CL/functions/CLGEMMLowp.cpp | 93 ---- .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 178 +++++++ src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp | 37 ++ .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 10 +- tests/datasets/LargeGEMMLowpDataset.h | 2 + tests/datasets/SmallGEMMLowpDataset.h | 4 +- tests/validation/CL/GEMMLowp.cpp | 172 +++++++ tests/validation/NEON/GEMMLowp.cpp | 18 +- 29 files changed, 1932 insertions(+), 360 deletions(-) create mode 100644 arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h create mode 100644 arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h create mode 100644 arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h delete mode 100644 arm_compute/runtime/CL/functions/CLGEMMLowp.h create mode 100644 arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h create mode 100644 arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h create mode 100644 src/core/CL/cl_kernels/gemmlowp.cl create mode 100644 src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp create mode 100644 src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp create mode 100644 src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp delete mode 100644 src/runtime/CL/functions/CLGEMMLowp.cpp create mode 100644 src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp create mode 100644 src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp create mode 100644 tests/validation/CL/GEMMLowp.cpp diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h index 
8da0cecad5..e80df6aada 100644 --- a/arm_compute/core/CL/CLKernels.h +++ b/arm_compute/core/CL/CLKernels.h @@ -58,6 +58,9 @@ #include "arm_compute/core/CL/kernels/CLFloorKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h index 05956aeeba..b60b80618c 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h @@ -30,15 +30,15 @@ namespace arm_compute { class ICLTensor; -/** OpenCL kernel to compute low precision matrix multiplication kernel +/** OpenCL kernel to multiply matrices * + * @note @ref CLGEMMLowpMatrixMultiplyKernel low precision matrix product kernel * This kernel performs the following computation: - * -# Convert a values from uint8 to int32 and add a_offset to each of them. - * -# Convert b values from uint8 to int32 and add b_offset to each of them. - * -# Compute the int32 matrix product of the resulting a * b. - * -# Add output_offset to each entry of the result. - * -# Multiply each entry of the result and round to the nearest integer - * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8. + * + * -# Convert a values from int8 to int32 + * -# Convert b values from int8 to int32 + * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 + * */ class CLGEMMLowpMatrixMultiplyKernel : public ICLKernel { @@ -55,19 +55,12 @@ public: CLGEMMLowpMatrixMultiplyKernel &operator=(CLGEMMLowpMatrixMultiplyKernel &&) = default; /** Initialise the kernel's input and output. * - * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel. - * These two kernels change the layout of the original matrices to be more cache-friendly. - * - * @param[in] input0 Input tensor containing the interleaved Matrix A. Data types supported: U8 - * @param[in] input1 Input tensor containing the transposed Matrix B. Data types supported: same as @p input0 - * @param[out] output Output tensor to store the result of matrix multiplication, Data types supported: same as @p input0 - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_offset Offset to be added to each element of the output matrix - * @param[in] output_mult_int Offset to be added to each element of the output matrix - * @param[in] shift Number of bits to shift right the result. + * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: QASYMM8 + * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0 + * @param[out] output Output tensor to store the result of matrix multiplication. 
Data type supported: S32 + * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift); + void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed = true); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h new file mode 100644 index 0000000000..5f2e025687 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ +#define __ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), + * and adds to it the offset contribution of matrix A and matrix B in-place. 
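+ * Writing each element of matrix A as (A[i][j] + a_offset) and each element of matrix B as (B[j][k] + b_offset),
+ * expanding the int32 product shows that the terms missing from the raw A * B accumulation are a_offset times
+ * the column sums of matrix B, b_offset times the row sums of matrix A, and the constant a_offset * b_offset * k.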
+ * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (vector_sum_col[k] * a_offset) + + * (vector_sum_row[i] * b_offset) + + * (a_offset * b_offset * k) + * + */ +class CLGEMMLowpOffsetContributionKernel : public ICLKernel +{ +public: + /** Constructor */ + CLGEMMLowpOffsetContributionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpOffsetContributionKernel(const CLGEMMLowpOffsetContributionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpOffsetContributionKernel &operator=(const CLGEMMLowpOffsetContributionKernel &) = delete; + /** Allow instances of this class to be moved */ + CLGEMMLowpOffsetContributionKernel(CLGEMMLowpOffsetContributionKernel &&) = default; + /** Allow instances of this class to be moved */ + CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in, out] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + */ + void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_vector_sum_col; + const ICLTensor *_vector_sum_row; + ICLTensor *_mm_result; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h new file mode 100644 index 0000000000..75f3750714 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__ +#define __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 + * + * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. + * The following computations will be performed by the kernel: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result if bias tensor is not a nullptr + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + */ +class CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel : public ICLKernel +{ +public: + /** Constructor */ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete; + /** Allow instances of this class to be moved */ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default; + /** Allow instances of this class to be moved */ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. 
Data type supported: Data type supported: QASYMM8 + * @param[in] result_offset Offset to be added to each element of the input matrix + * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_bias; + ICLTensor *_output; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__ */ \ No newline at end of file diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h new file mode 100644 index 0000000000..aa0583fe81 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H__ +#define __ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Common interface for all OpenCL reduction kernels */ +class ICLGEMMLowpReductionKernel : public ICLKernel +{ +public: + /** Constructor */ + ICLGEMMLowpReductionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + ICLGEMMLowpReductionKernel(const ICLGEMMLowpReductionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + ICLGEMMLowpReductionKernel &operator=(const ICLGEMMLowpReductionKernel &) = delete; + /** Allow instances of this class to be moved */ + ICLGEMMLowpReductionKernel(ICLGEMMLowpReductionKernel &&) = default; + /** Allow instances of this class to be moved */ + ICLGEMMLowpReductionKernel &operator=(ICLGEMMLowpReductionKernel &&) = default; + + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data type supported: S8 + * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 + */ + virtual void configure(const ICLTensor *input, ICLTensor *output) = 0; + +protected: + const ICLTensor *_input; + ICLTensor *_output; +}; + +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class CLGEMMLowpMatrixAReductionKernel : public ICLGEMMLowpReductionKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] mtx_a Input tensor. Data type supported: QASYMM8 + * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 + */ + void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row) override; + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class CLGEMMLowpMatrixBReductionKernel : public ICLGEMMLowpReductionKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8 + * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 + */ + void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col) override; + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h index 27cb3f2c1c..989260de11 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h @@ -30,9 +30,9 @@ namespace arm_compute { class ITensor; -/* NEON kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. 
The computation is performed in-place +/** NEON kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place * - * This kernel takes a final int32 accumulator value (the output of @NEGEMMLowpMatrixMultiplyKernel), + * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), * and adds to it the offset contribution of matrix A and matrix B in-place. * * The final result is: diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h index 7684350c0f..a522069330 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h @@ -30,7 +30,7 @@ namespace arm_compute { class ITensor; -/* NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 +/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 * * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. * The following computations will be performed by the kernel: diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h index 9ca5cdf828..50d8b4070e 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h @@ -47,7 +47,7 @@ public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data type supported: S8 + * @param[in] input Input tensor. Data type supported: QASYMM8 * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 * @param[in] k Number of matrix A columns (or matrix B rows) * @param[in] is_reshaped True if the input tensor has been reshaped diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h index 360372d192..9a20769ca1 100644 --- a/arm_compute/runtime/CL/CLFunctions.h +++ b/arm_compute/runtime/CL/CLFunctions.h @@ -59,7 +59,8 @@ #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" #include "arm_compute/runtime/CL/functions/CLGEMM.h" #include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h" -#include "arm_compute/runtime/CL/functions/CLGEMMLowp.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" #include "arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h" #include "arm_compute/runtime/CL/functions/CLGaussian3x3.h" #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowp.h b/arm_compute/runtime/CL/functions/CLGEMMLowp.h deleted file mode 100644 index ffd997f6ec..0000000000 --- a/arm_compute/runtime/CL/functions/CLGEMMLowp.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_CLGEMMLOWP_H__ -#define __ARM_COMPUTE_CLGEMMLOWP_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h" -#include "arm_compute/runtime/CL/CLMemoryGroup.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/IMemoryManager.h" - -#include - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to execute GEMMLowp on OpenCL. This function calls the following OpenCL kernels: -* -* -# @ref CLGEMMInterleave4x4Kernel -* -# @ref CLGEMMTranspose1xWKernel -* -# @ref CLGEMMLowpMatrixMultiplyKernel -* -*/ -class CLGEMMLowp : public IFunction -{ -public: - /** Constructor */ - CLGEMMLowp(std::shared_ptr memory_manager = nullptr); - /** Initialise the kernel's inputs, output - * - * @note GEMM_LOWP: low precision matrix multiply kernel - * This kernel performs the following computation: - * - * -# Convert a values from uint8 to int32 and add a_offset to each of them. - * -# Convert b values from uint8 to int32 and add b_offset to each of them. - * -# Compute the int32 matrix product of the resulting a * b. - * -# Add output_offset to each entry of the result. - * -# Multiply each entry of the result and round to the nearest integer - * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8. - * - * @param[in] a First input tensor (Matrix A). Data types supported: U8. - * @param[in] b Second input tensor (Matrix B). Data types supported: same as @p a. - * @param[out] output Output tensor. Data types supported: same as @p a. - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_offset Offset to be added to each element of the output matrix - * @param[in] output_mult_int Multiplied with each element of the output matrix - * @param[in] shift Number of bits to shift right the result. 
- */ - void configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift); - - // Inherited methods overridden: - void run() override; - -private: - CLMemoryGroup _memory_group; - CLGEMMInterleave4x4Kernel _interleave_kernel; - CLGEMMTranspose1xWKernel _transpose_kernel; - CLGEMMLowpMatrixMultiplyKernel _mm_kernel; - CLTensor _tmp_a; - CLTensor _tmp_b; -}; -} -#endif /*__ARM_COMPUTE_CLGEMMLOWP_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h new file mode 100644 index 0000000000..9944afeac7 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ +#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ + +#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h" +#include "arm_compute/runtime/CL/CLMemoryGroup.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class IMemoryManager; +class ICLTensor; + +/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. 
This function calls the following OpenCL kernels: + * + * -# @ref CLGEMMInterleave4x4Kernel (if the output tensor is a matrix) + * -# @ref CLGEMMTranspose1xWKernel (if the output tensor is a matrix) + * -# @ref CLGEMMLowpMatrixMultiplyKernel + * -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0) + * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0) + * -# @ref CLGEMMLowpOffsetContributionKernel + * +*/ +class CLGEMMLowpMatrixMultiplyCore : public IFunction +{ +public: + /** Constructor */ + CLGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager = nullptr); + /** Initialise the kernel's inputs, output + * + * @note GEMM_LOWP: low precision GEMM kernel + * This kernel performs the following computations: + * + * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. + * -# Convert b values from QASYMM8 to int32 add b_offset to each of them. + * -# Compute the matrix product of the resulting a * b in int32. + * + * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8. + * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a + * @param[out] output Output tensor. Data type supported: Data type supported: S32 + */ + void configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output); + + // Inherited methods overridden: + void run() override; + +private: + CLMemoryGroup _memory_group; + CLGEMMLowpMatrixMultiplyKernel _mm_kernel; + CLGEMMInterleave4x4Kernel _mtx_a_reshape_kernel; + CLGEMMTranspose1xWKernel _mtx_b_reshape_kernel; + CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; + CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; + CLGEMMLowpOffsetContributionKernel _offset_contribution_kernel; + CLTensor _vector_sum_col; + CLTensor _vector_sum_row; + CLTensor _tmp_a; + CLTensor _tmp_b; + int32_t _a_offset; + int32_t _b_offset; + bool _is_interleaved_transposed; +}; +} +#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h new file mode 100644 index 0000000000..5c176a960b --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__ +#define __ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +/** This file contains all available output stages for GEMMLowp on OpenCL. + * + * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyCore), + * and processes it to obtain the final ASYMM8 value. + * + * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md + */ + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8Scale on OpenCL. + * + * CLGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift + * The final result is: + * + * ((input[i][k] + result_offset) * result_mult_int) >> result_shift + * + * In case the bias tensor is provided, the final result is: + * + * ((input[i][k] + result_offset) * result_mult_int + bias[k]) >> result_shift + * + * This function calls the following OpenCL kernels: + * + * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel + * + * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions + * before the result is shifted right by result_shift +*/ +class CLGEMMLowpQuantizeDownInt32ToUint8Scale : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output + * + * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8 + * @param[in] result_offset Offset to be added to each element of the input matrix + * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0); +}; +} +#endif /*__ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__ */ \ No newline at end of file diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox index b5a1d59f6a..cc12897278 100644 --- a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -253,7 +253,7 @@ v17.03.1 First Major public release of the sources - New CPP target introduced for C++ kernels shared between NEON and CL functions. - New padding calculation interface introduced and ported most kernels / functions to use it. 
- New OpenCL kernels / functions: - - @ref arm_compute::CLGEMMLowpMatrixMultiplyKernel / @ref arm_compute::CLGEMMLowp + - @ref arm_compute::CLGEMMLowpMatrixMultiplyKernel / arm_compute::CLGEMMLowp - New NEON kernels / functions: - @ref arm_compute::NENormalizationLayerKernel / @ref arm_compute::NENormalizationLayer - @ref arm_compute::NETransposeKernel / @ref arm_compute::NETranspose diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 6cc5a9a6b5..948fe441cf 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -218,7 +218,6 @@ const std::map CLKernelLibrary::_kernel_program_map = { "gemm_ma_qs8", "gemm.cl" }, { "gemm_ma_qs16", "gemm.cl" }, { "gemm_mv", "gemv.cl" }, - { "gemm_mm_interleaved_transposed_u8", "gemm.cl" }, { "gemm_mm_interleaved_transposed_f16", "gemm.cl" }, { "gemm_mm_interleaved_transposed_f32_midgard", "gemm.cl" }, { "gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl" }, @@ -233,6 +232,12 @@ const std::map CLKernelLibrary::_kernel_program_map = { "gemm_transpose1x16", "gemm.cl" }, { "gemm_transpose1x8", "gemm.cl" }, { "gemm_transpose1x4", "gemm.cl" }, + { "gemmlowp_matrix_a_reduction", "gemmlowp.cl" }, + { "gemmlowp_matrix_b_reduction", "gemmlowp.cl" }, + { "gemmlowp_mm", "gemmlowp.cl" }, + { "gemmlowp_mm_interleaved_transposed", "gemmlowp.cl" }, + { "gemmlowp_offset_contribution", "gemmlowp.cl" }, + { "gemmlowp_output_stage_quantize_down", "gemmlowp.cl" }, { "harris_score_3x3", "harris_corners.cl" }, { "harris_score_5x5", "harris_corners.cl" }, { "harris_score_7x7", "harris_corners.cl" }, @@ -480,6 +485,10 @@ const std::map CLKernelLibrary::_program_source_map = { "gemm.cl", #include "./cl_kernels/gemm.clembed" + }, + { + "gemmlowp.cl", +#include "./cl_kernels/gemmlowp.clembed" }, { "gemv.cl", diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl index 15111ed352..c763cb355b 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/gemm.cl @@ -251,110 +251,6 @@ __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src), } #if defined(COLS_B) -/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) - * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication - * - * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B - * - * @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. 
Supported formats: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported formats: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] a_offset Offset to be added to each element of the matrix A - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] c_offset Offset to be added to each element of the matrix C. - * @param[in] c_mult_int Multiplied with each element of the matrix C. - * @param[in] shift Number of bits to shift right the result. - */ -__kernel void gemm_mm_interleaved_transposed_u8(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), - IMAGE_DECLARATION(dst), - int a_offset, - int b_offset, - int c_offset, - int c_mult_int, - int shift) -{ - // src_addr.s0 = address of matrix A - // src_addr.s1 = address of matrix B - - // Compute address for matrix A and B - int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), - (src1_stride_y)); - - // Add offset_first_element_in_bytes - src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Compute end row address for matrix B - int end_row_mtx_b = src_addr.s1 + COLS_B; - - // Reset accumulators - int16 c00 = 0.0f; - int16 c10 = 0.0f; - int16 c20 = 0.0f; - int16 c30 = 0.0f; - - for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32)) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0)); - int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1)); - - c00 += (int16)a0.s0 * b0; - c10 += (int16)a0.s1 * b0; - c20 += (int16)a0.s2 * b0; - c30 += (int16)a0.s3 * b0; - - int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16)); - - c00 += (int16)a0.s4 * b1; - c10 += (int16)a0.s5 * b1; - c20 += (int16)a0.s6 * b1; - c30 += (int16)a0.s7 * b1; - } - - for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16)) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0)); - int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1)); - - c00 += (int16)a0.s0 * b0; - c10 += (int16)a0.s1 * b0; - c20 += (int16)a0.s2 * b0; - c30 += (int16)a0.s3 * b0; - } - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Multiply by the weight of matrix product - c00 = (((int16)c_offset + c00) * 
(int16)c_mult_int) >> shift; - c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift; - c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift; - c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift; - - // Store 4x16 block - vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0))); - vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1))); - vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2))); - vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3))); -} - /** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1) * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication * diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl new file mode 100644 index 0000000000..7cd0c0b8db --- /dev/null +++ b/src/core/CL/cl_kernels/gemmlowp.cl @@ -0,0 +1,540 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(COLS_B) +/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) + * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication + * + * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src1_ptr Pointer to the source matrix. 
Supported data type: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + */ +__kernel void gemmlowp_mm_interleaved_transposed(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), + IMAGE_DECLARATION(dst)) +{ + // src_addr.s0 = address of matrix A + // src_addr.s1 = address of matrix B + // Compute address for matrix A and B + int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), + (src1_stride_y)); + + // Add offset_first_element_in_bytes + src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Compute end row address for matrix B + int end_row_mtx_b = src_addr.s1 + COLS_B; + + // Reset accumulators + int16 c00 = 0; + int16 c10 = 0; + int16 c20 = 0; + int16 c30 = 0; + + for(; src_addr.s1 <= (end_row_mtx_b - 32); src_addr += (int2)(8, 32)) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + int8 a0 = convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0)); + int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1)); + + c00 += (int16)a0.s0 * b0; + c10 += (int16)a0.s1 * b0; + c20 += (int16)a0.s2 * b0; + c30 += (int16)a0.s3 * b0; + + int16 b1 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16)); + + c00 += (int16)a0.s4 * b1; + c10 += (int16)a0.s5 * b1; + c20 += (int16)a0.s6 * b1; + c30 += (int16)a0.s7 * b1; + } + + for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16)) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + int4 a0 = convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0)); + int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1)); + + c00 += (int16)a0.s0 * b0; + c10 += (int16)a0.s1 * b0; + c20 += (int16)a0.s2 * b0; + c30 += (int16)a0.s3 * b0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Store 4x16 block + vstore16(c00, 0, (__global int *)(offset(&dst, 0, 0))); + vstore16(c10, 0, (__global int *)(offset(&dst, 0, 1))); + vstore16(c20, 0, (__global int *)(offset(&dst, 0, 2))); + vstore16(c30, 0, (__global int *)(offset(&dst, 0, 3))); +} +#endif // defined(COLS_B) + +#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A) +#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X) +#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X) +#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X) +/** This 
OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped + * + * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + */ +__kernel void gemmlowp_mm(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), + IMAGE_DECLARATION(dst)) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx; + + int end_row_vec_a = src_addr.s0 + COLS_A; + + VECTOR_UINT acc0 = 0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VECTOR_UINT acc1 = 0; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VECTOR_UINT acc2 = 0; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VECTOR_UINT acc3 = 0; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y)) + { + // Load values from matrix A + uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + // Load 
values from matrix B + VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1); + VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y); + + // Accumulate + acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0; + acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0; + acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0; + acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0; + acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y)) + { + // Load values from matrix A + uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + // Load values from matrix B + VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1); + + // Accumulate + acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Store the result + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 0))); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 1))); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 2))); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 3))); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +} +#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A) + +#if defined(COLS_A) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. 
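+ * Each work-item accumulates one row of Matrix A and writes a single S32 element of the output vector,
+ * which is consumed as vector_sum_row by @ref CLGEMMLowpOffsetContributionKernel and scaled there by b_offset.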
+ * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + uint4 sum_row_u32 = (uint4)0; + uint sum_row = 0; + + __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z); + + int i = 0; + + // This for loop performs 16 accumulations + for(; i <= ((int)COLS_A - 16); i += 16) + { + const uchar16 a0_u8 = vload16(0, matrix_a + i); + + sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF); + } + + // This for loop performs the leftover accumulations + for(; i < COLS_A; ++i) + { + sum_row += matrix_a[i]; + } + + sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3; + + *((__global int *)dst.ptr) = (int)sum_row; +} +#endif // defined(COLS_A) + +#if defined(COLS_B) && defined(ROWS_B) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data type: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + uint16 sum_col_u32 = (uint16)0; + + __global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z); + + int i = 0; + // This for loop performs 4 accumulations + for(; i <= ((int)ROWS_B - 4); i += 4) + { + const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y); + const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y); + const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y); + const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y); + + sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8); + + matrix_b += 4 * src_stride_y; + } + + // This for loop performs the leftover accumulations + for(; i < (int)ROWS_B; ++i) + { + const uchar16 b0_u8 = vload16(0, matrix_b); + + sum_col_u32 += convert_uint16(b0_u8); + + matrix_b += src_stride_y; + } + + vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr); +} +#endif // defined(COLS_B) && defined(ROWS_B) + +#if defined(K_OFFSET) +/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), + * and adds to it the offset contribution of matrix A and matrix B in-place. + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (sum_col[k] * A_OFFSET) + + * (sum_row[i] * B_OFFSET) + + * (K_OFFSET) + * + * @param[in] mm_result_ptr Pointer to the source tensor.
Supported data type: S32 + * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_col_result_ptr Pointer to the source tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_result_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_col_result_step_x sum_col_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_col_result_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_col_result_step_y sum_col_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_col_result_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_row_result_ptr Pointer to the source tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_result_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_row_result_step_x sum_row_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_row_result_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_row_result_step_y sum_row_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_row_result_offset_first_element_in_bytes The offset of the first element in the source tensor + */ +__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result) +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) + ) +{ + Tensor3D mm_result = CONVERT_TO_TENSOR3D_STRUCT(mm_result); + + int16 a_offset_s32 = (int16)0; + int16 b_offset_s32 = (int16)0; + +#if defined(A_OFFSET) + Image sum_col = CONVERT_TO_IMAGE_STRUCT(sum_col); + + // Compute the offset contribution due to A_OFFSET + a_offset_s32 = vload16(0, (__global int *)sum_col.ptr + get_global_id(2) * sum_col_stride_y); + a_offset_s32 *= (int16)A_OFFSET; +#endif // defined(A_OFFSET) + +#if defined(B_OFFSET) + Image sum_row = CONVERT_TO_IMAGE_STRUCT(sum_row); + + // Compute the offset contribution due to B_OFFSET + b_offset_s32 = (int16) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1)); + b_offset_s32 *= (int16)B_OFFSET; +#endif // defined(B_OFFSET) + + const int16 offset_term_s32 = (int16)K_OFFSET + a_offset_s32 + b_offset_s32; + + int16 in_s32 = vload16(0, (__global int *)mm_result.ptr); + + // Add the offset terms to GEMM's result + in_s32 += offset_term_s32; + + // Store the result with the offset contribution + vstore16(in_s32, 0, (__global int *)mm_result.ptr); +} +#endif // defined(K_OFFSET) + +#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) +/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 + 
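Taken together, the offset contribution applied above and the quantize-down stage documented here amount, per output element, to the following scalar sketch (function name is hypothetical; bias is only added when -DADD_BIAS is defined, and the clamps mirror -DMIN_BOUND/-DMAX_BOUND, assuming 0 <= min_bound <= max_bound <= 255):

#include <algorithm>
#include <cstdint>

// Scalar sketch of one output element: offset contribution, then quantize-down.
inline uint8_t quantize_down_element(int32_t mm_result, int32_t sum_col, int32_t sum_row,
                                     int32_t a_offset, int32_t b_offset, int32_t k,
                                     int32_t result_offset, int32_t result_mult_int,
                                     int32_t result_shift, int32_t bias,
                                     int32_t min_bound, int32_t max_bound)
{
    // Offset contribution: mm_result + sum_col * A_OFFSET + sum_row * B_OFFSET + K_OFFSET
    int32_t acc = mm_result + sum_col * a_offset + sum_row * b_offset + a_offset * b_offset * k;

    // Quantize down: add the result offset, scale, add the optional bias, then shift right
    acc = (acc + result_offset) * result_mult_int + bias;
    acc >>= result_shift;

    // Saturate to [0..255], then apply the optional bounded ReLU clamp
    acc = std::clamp<int32_t>(acc, 0, 255);
    acc = std::clamp(acc, min_bound, max_bound);
    return static_cast<uint8_t>(acc);
}

Passing bias = 0, min_bound = 0 and max_bound = 255 reproduces the path taken when -DADD_BIAS, -DMIN_BOUND and -DMAX_BOUND are not defined.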
* + * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value. + * The following computations will be performed by the kernel: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result (if -DADD_BIAS is passed at compile time) + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time) + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time + * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. + * These values can be used to implement "rectified linear unit" activation functions + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] biases_ptr Pointer to the biases tensor.
Supported data type: same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); +#if defined(ADD_BIAS) + Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); +#endif // defined(ADD_BIAS) + + int16 input_values = vload16(0, (__global int *)src.ptr); + + // Add the offset terms to GEMM's result + input_values += (int16)RESULT_OFFSET; + + // Multiply by result_mult_int + input_values *= (int16)RESULT_MULT_INT; + +#if defined(ADD_BIAS) + // Add bias + const int16 biases_values = vload16(0, (__global int *)biases.ptr); + input_values += (int16)biases_values; +#endif // defined(ADD_BIAS) + + // Shift final result + input_values >>= RESULT_SHIFT; + + // Saturate negative values + input_values = max(input_values, (int16)0); + + uchar16 res = convert_uchar16_sat(input_values); + +#if defined(MIN_BOUND) + res = max(res, (uchar16)MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (uchar16)MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore16(res, 0, dst.ptr); +} +#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) \ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp index ef572cfc7e..b3227c0db9 100644 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp @@ -51,45 +51,88 @@ CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel() { } -void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, - int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift) +void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); + 
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + + if(!is_interleaved_transposed) + { + ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); + } + + TensorShape in1_shape = input1->info()->tensor_shape(); + in1_shape.collapse(2); _input0 = input0; _input1 = input1; _output = output; - // Create kernel and set static arguments - std::set build_opts = { ("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))) }; - _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_u8", build_opts)); - unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, a_offset); - _kernel.setArg(idx++, b_offset); - _kernel.setArg(idx++, output_offset); - _kernel.setArg(idx++, output_mult_int); - _kernel.setArg(idx++, shift); + CLBuildOptions build_opts; - // Configure window - constexpr unsigned int num_elems_processed_per_iteration_x = 16; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - constexpr unsigned int num_elems_read_per_iteration_input0 = 4; - constexpr unsigned int num_elems_read_per_iteration_input1 = 16; + if(is_interleaved_transposed) + { + // Create kernel and set static arguments + build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))); + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_mm_interleaved_transposed", build_opts.options())); - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + // Configure window + constexpr unsigned int num_elems_processed_per_iteration_x = 16; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + constexpr unsigned int num_elems_read_per_iteration_input0 = 4; + constexpr unsigned int num_elems_read_per_iteration_input1 = 16; - AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1); - AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1); - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - update_window_and_padding(win, input0_access, input1_access, output_access); + AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1); + AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + update_window_and_padding(win, input0_access, input1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + ICLKernel::configure(win); + } + else + { + // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. 
num_elems_processed_per_iteration_x + constexpr unsigned int num_elems_processed_per_iteration_x = 16; + const unsigned int num_elems_processed_per_iteration_y = std::min(static_cast(output->info()->dimension(1)), 4); + + build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))); + build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x)); + build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y)); + + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_mm", build_opts.options())); + + // Configure window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowStatic input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y)); + AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1)); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + update_window_and_padding(win, input0_access, input1_access, output_access); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure(win); + } - ICLKernel::configure(win); + // Set config_id for enabling LWS tuning + _config_id = "gemmlowp_"; + _config_id += (is_interleaved_transposed ? "reshaped_" : ""); + _config_id += lower_string(string_from_data_type(input0->info()->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(0)); + _config_id += "_"; + _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1))); } void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue) @@ -117,7 +160,7 @@ void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue add_2D_tensor_argument(idx, _input0, slice); add_2D_tensor_argument(idx, _input1, slice_b); add_2D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice); + enqueue(queue, *this, slice, _lws_hint); } while(window.slide_window_slice_2D(slice)); } diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp new file mode 100644 index 0000000000..96919fe3cb --- /dev/null +++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "support/ToolchainSupport.h" + +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel() + : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr) +{ +} + +void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0)); + + TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); + vector_sum_col_shape.collapse(1); + + build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); + } + + // If b_offset == 0, vector_sum_row can be a nullptr + if(b_offset != 0) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1)); + + TensorShape output_shape = mm_result->info()->tensor_shape(); + TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape(); + vector_sum_row_shape.collapse(1); + output_shape.collapse(2); + + ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor"); + + if(a_offset != 0) + { + TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); + vector_sum_col_shape.collapse(1); + + ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 + && vector_sum_col_shape[1] 
!= vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + } + + build_opts.add_option("-DB_OFFSET=" + support::cpp11::to_string(b_offset)); + } + + build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options())); + + _vector_sum_col = vector_sum_col; + _vector_sum_row = vector_sum_row; + _mm_result = mm_result; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, mm_result_access); + + if(a_offset != 0) + { + AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, vector_sum_col_access); + } + + if(b_offset != 0) + { + AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0); + update_window_and_padding(win, vector_sum_row_access); + } + + ICLKernel::configure(win); +} + +void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + // Set window for vector_sum_col + Window win_vector_sum_col = slice; + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // Set window for vector_sum_row + Window win_vector_sum_row = slice; + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _mm_result, slice); + if(_vector_sum_col != nullptr) + { + add_2D_tensor_argument(idx, _vector_sum_col, win_vector_sum_col); + } + if(_vector_sum_row != nullptr) + { + add_2D_tensor_argument(idx, _vector_sum_row, win_vector_sum_row); + } + enqueue(queue, *this, slice); + } + while(collapsed.slide_window_slice_3D(slice)); +} diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp new file mode 100644 index 0000000000..fa6a48e77c --- /dev/null +++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel() + : _input(nullptr), _bias(nullptr), _output(nullptr) +{ +} + +void CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min, + int max) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON(max > 255); + ARM_COMPUTE_ERROR_ON(min < 0 || min > max); + + if(bias != nullptr) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != bias->info()->dimension(0)); + } + + _input = input; + _bias = bias; + _output = output; + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(result_offset)); + build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(result_mult_int)); + build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(result_shift)); + build_opts.add_option_if((min != 0) && (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if((max != 255) && (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max)); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_output_stage_quantize_down", build_opts.options())); + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_result_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + input_access, + output_result_access); + + if(bias != nullptr) + { + AccessWindowStatic bias_access(bias->info(), 0, 0, ceil_to_multiple(bias->info()->dimension(0), num_elems_processed_per_iteration), bias->info()->tensor_shape()[1]); + + update_window_and_padding(win, + bias_access); + } + + output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void 
CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + unsigned int idx1 = num_arguments_per_3D_tensor(); + if(_bias != nullptr) + { + Window biases_slice(slice); + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + add_1D_tensor_argument(idx1, _bias, biases_slice); + } + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx1, _output, slice); + enqueue(queue, *this, slice); + } + while(collapsed.slide_window_slice_3D(slice)); +} \ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp new file mode 100644 index 0000000000..6f410d3b14 --- /dev/null +++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "support/ToolchainSupport.h" + +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel() + : _input(), _output() +{ +} + +void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + + _input = mtx_a; + _output = vector_sum_row; + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->info()->dimension(0))); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_a_reduction", build_opts.options())); + + const unsigned int num_elems_processed_per_iteration = 1; + + // Configure kernel window + Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1)); + AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + input_access, + output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); + Window slice_in = collapsed.first_slice_window_2D(); + Window slice_out = collapsed.first_slice_window_2D(); + + // Setup input slice. Its dimensions are increased in the cl kernel. 
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_2D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } + while(collapsed.slide_window_slice_2D(slice_out)); +} + +void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + + _input = mtx_b; + _output = vector_sum_col; + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(0))); + build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(1))); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_b_reduction", build_opts.options())); + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*vector_sum_col->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1)); + AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + input_access, + output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY); + + Window slice_out = collapsed.first_slice_window_2D(); + Window slice_in = slice_out; + + slice_in.set(Window::DimY, Window::Dimension(0, 1, 1)); + slice_in.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_2D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } + while(collapsed.slide_window_slice_2D(slice_out)); +} diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp index a8395a15cb..81094f8743 100644 --- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp @@ -209,7 +209,7 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf uint32x4_t sum_row_u32 = vdupq_n_u32(0); uint32_t sum_row = 0; - const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + +id.y() * _input->info()->strides_in_bytes()[2]); + const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); #if __arm__ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp deleted file mode 100644 index db6d11c2c3..0000000000 --- a/src/runtime/CL/functions/CLGEMMLowp.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLGEMMLowp.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -using namespace arm_compute; - -CLGEMMLowp::CLGEMMLowp(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b() -{ -} - -void CLGEMMLowp::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output); - ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C"); - - // Create shape for interleaved temporary tensor - TensorShape shape_tmp_a = a->info()->tensor_shape(); - shape_tmp_a.set(0, a->info()->dimension(0) * 4); - shape_tmp_a.set(1, ceil(a->info()->dimension(1) / 4)); - TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type()); - _tmp_a.allocator()->init(info_a); - - // Create shape for tranposed temporary tensor - TensorShape shape_tmp_b = b->info()->tensor_shape(); - shape_tmp_b.set(0, b->info()->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(static_cast(b->info()->dimension(0)) / 16)); - TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type()); - _tmp_b.allocator()->init(info_b); - - // Manage intermediate buffers - _memory_group.manage(&_tmp_a); - _memory_group.manage(&_tmp_b); - - // Configure kernels - _interleave_kernel.configure(a, &_tmp_a); - _transpose_kernel.configure(b, &_tmp_b); - _mm_kernel.configure(&_tmp_a, &_tmp_b, output, 
a_offset, b_offset, output_offset, output_mult_int, shift); - - // Allocate intermediate buffers - _tmp_a.allocator()->allocate(); - _tmp_b.allocator()->allocate(); -} - -void CLGEMMLowp::run() -{ - _memory_group.acquire(); - - /* Run interleave kernel */ - CLScheduler::get().enqueue(_interleave_kernel, false); - - /* Run transpose kernel */ - CLScheduler::get().enqueue(_transpose_kernel, false); - - /* Run matrix multiply kernel */ - CLScheduler::get().enqueue(_mm_kernel, false); - - _memory_group.release(); -} diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp new file mode 100644 index 0000000000..5d2d13e243 --- /dev/null +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) + : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), + _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true) +{ +} + +void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); + ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A"); + ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B"); + + _a_offset = a->info()->quantization_info().offset; + _b_offset = b->info()->quantization_info().offset; + + // If the input tensor has less than 16 rows, we run a special version of GEMMLowp without reshaping the input tensors + _is_interleaved_transposed = a->info()->dimension(1) > 16; + + const ICLTensor *matrix_a = a; + const ICLTensor *matrix_b = b; + + if(_is_interleaved_transposed) + { + matrix_a = &_tmp_a; + matrix_b = &_tmp_b; + + // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] + TensorShape shape_tmp_a = a->info()->tensor_shape(); + shape_tmp_a.set(0, a->info()->dimension(0) * 4); + shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f)); + + // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] + TensorShape shape_tmp_b = b->info()->tensor_shape(); + shape_tmp_b.set(0, b->info()->dimension(1) * 16); + shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f)); + + TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type()); + TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type()); + _tmp_a.allocator()->init(info_a); + _tmp_b.allocator()->init(info_b); + _memory_group.manage(&_tmp_a); + _memory_group.manage(&_tmp_b); + + // Configure interleave kernel + _mtx_a_reshape_kernel.configure(a, &_tmp_a); + + // Configure transpose kernel + _mtx_b_reshape_kernel.configure(b, &_tmp_b); + } + + // Configure matrix multiply kernel + _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed); + + // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0) + { + TensorShape shape_vector_sum_col = b->info()->tensor_shape(); + if(b->info()->num_dimensions() > 1) + { + shape_vector_sum_col.remove_dimension(1); + } + TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32); + 
_vector_sum_col.allocator()->init(info_vector_sum_col); + _memory_group.manage(&_vector_sum_col); + + // Configure Matrix B reduction kernel + _mtx_b_reduction_kernel.configure(b, &_vector_sum_col); + } + + // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 + if(_b_offset != 0) + { + TensorShape shape_vector_sum_row = a->info()->tensor_shape(); + shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1)); + if(a->info()->num_dimensions() > 1) + { + shape_vector_sum_row.remove_dimension(1); + } + TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32); + _vector_sum_row.allocator()->init(info_vector_sum_row); + _memory_group.manage(&_vector_sum_row); + + // Configure matrix A reduction kernel + _mtx_a_reduction_kernel.configure(a, &_vector_sum_row); + } + + // Configure offset contribution kernel + _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset); + + // Allocate tensors + if(_is_interleaved_transposed) + { + _tmp_a.allocator()->allocate(); + _tmp_b.allocator()->allocate(); + } + + if(_a_offset != 0) + { + _vector_sum_col.allocator()->allocate(); + } + + if(_b_offset != 0) + { + _vector_sum_row.allocator()->allocate(); + } +} + +void CLGEMMLowpMatrixMultiplyCore::run() +{ + _memory_group.acquire(); + + if(_is_interleaved_transposed) + { + // Run reshape matrix A + CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false); + + // Run reshape matrix B + CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false); + } + + // Run matrix multiply + CLScheduler::get().enqueue(_mm_kernel, false); + + // Run matrix A reduction kernel only if _b_offset is not equal to 0 + if(_b_offset != 0) + { + CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false); + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0) + { + CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); + } + + // Run offset contribution kernel + CLScheduler::get().enqueue(_offset_contribution_kernel, true); + + _memory_group.release(); +} diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp new file mode 100644 index 0000000000..b1d620d8a2 --- /dev/null +++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, bias, output, result_offset, result_mult_int, result_shift, min, max); + _kernel = std::move(k); +} \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 92c911c370..da5ac22fdc 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -133,7 +133,10 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, if(_a_offset != 0) { TensorShape shape_vector_sum_col = b->info()->tensor_shape(); - shape_vector_sum_col.remove_dimension(1); + if(b->info()->num_dimensions() > 1) + { + shape_vector_sum_col.remove_dimension(1); + } TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32); _vector_sum_col.allocator()->init(info_vector_sum_col); _memory_group.manage(&_vector_sum_col); @@ -147,7 +150,10 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, { TensorShape shape_vector_sum_row = a->info()->tensor_shape(); shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1)); - shape_vector_sum_row.remove_dimension(1); + if(a->info()->num_dimensions() > 1) + { + shape_vector_sum_row.remove_dimension(1); + } TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32); _vector_sum_row.allocator()->init(info_vector_sum_row); _memory_group.manage(&_vector_sum_row); diff --git a/tests/datasets/LargeGEMMLowpDataset.h b/tests/datasets/LargeGEMMLowpDataset.h index cc1feb49a2..87f879e70a 100644 --- a/tests/datasets/LargeGEMMLowpDataset.h +++ b/tests/datasets/LargeGEMMLowpDataset.h @@ -42,7 +42,9 @@ class LargeGEMMLowpDataset final : public GEMMLowpDataset public: LargeGEMMLowpDataset() { + add_config(TensorShape(923U, 2U), TensorShape(871U, 923U), TensorShape(871U, 2U), 0, 0); add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0); + add_config(TensorShape(873U, 7U), TensorShape(784U, 873U), TensorShape(784U, 7U), -1, 3); add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4); add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), -2, 0); add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13); diff --git a/tests/datasets/SmallGEMMLowpDataset.h b/tests/datasets/SmallGEMMLowpDataset.h index 881546e70f..1d4ab53be5 100644 --- a/tests/datasets/SmallGEMMLowpDataset.h +++ b/tests/datasets/SmallGEMMLowpDataset.h @@ -42,11 +42,13 @@ class SmallGEMMLowpDataset final : public GEMMLowpDataset public: SmallGEMMLowpDataset() { + add_config(TensorShape(21U, 2U), TensorShape(43U, 21U), 
TensorShape(43U, 2U), 0, 0); add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), 0, 0); + add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U), -2, 13); add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U), 0, 4); add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), -2, 0); add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U), 5, 13); - add_config(TensorShape(38U, 12U), TensorShape(21U, 38U), TensorShape(21U, 12U), -3, -2); + add_config(TensorShape(38U, 43U), TensorShape(21U, 38U), TensorShape(21U, 43U), -3, -2); add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), -9, 1); } }; diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp new file mode 100644 index 0000000000..1968efcedc --- /dev/null +++ b/tests/validation/CL/GEMMLowp.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" +#include "tests/CL/CLAccessor.h" +#include "tests/PaddingCalculator.h" +#include "tests/datasets/LargeGEMMLowpDataset.h" +#include "tests/datasets/ShapeDatasets.h" +#include "tests/datasets/SmallGEMMLowpDataset.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "tests/validation/Validation.h" +#include "tests/validation/fixtures/GEMMLowpFixture.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +TEST_SUITE(CL) +TEST_SUITE(GEMMLowp) + +TEST_SUITE(MatrixMultiplyCore) +using CLGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture; + +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()), + shape_a, shape_b, shape_c, a_offset, b_offset) +{ + // Create tensors + CLTensor a = create_tensor(shape_a, DataType::QASYMM8); + CLTensor b = create_tensor(shape_b, DataType::QASYMM8); + CLTensor c = create_tensor(shape_c, DataType::S32); + + a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset)); + b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset)); + + ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Create and configure function + CLGEMMLowpMatrixMultiplyCore gemmlowp_mm; + gemmlowp_mm.configure(&a, &b, &c); +} + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset()) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset()) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} + +TEST_SUITE_END() // MatrixMultiplyCore + +TEST_SUITE(OutputStage) +TEST_SUITE(QuantizeDownInt32ToUint8Scale) + +const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2, + 3) + * framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true }); + +const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, + 2) + * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 173) * framework::dataset::make("addBias", { false, true }); + +using CLGEMMLowpQuantizeDownInt32ToUint8ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture; + +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallShapes(), datasets::LargeShapes()), quantize_down_int32_to_uint8_scale_cases), + shape, result_offset, result_mult_int, result_shift, min, max, add_bias) +{ + TensorShape shape_bias(shape[0]); + + // Create tensors + CLTensor in = 
create_tensor(shape, DataType::S32); + CLTensor bias = create_tensor(shape_bias, DataType::S32); + CLTensor out = create_tensor(shape, DataType::QASYMM8); + + ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Create and configure function + CLGEMMLowpQuantizeDownInt32ToUint8Scale output_stage; + output_stage.configure(&in, add_bias ? &bias : nullptr, &out, result_offset, result_mult_int, result_shift, min, max); + + // Validate valid region input and output + const ValidRegion valid_region = shape_to_valid_region(shape); + validate(in.info()->valid_region(), valid_region); + validate(out.info()->valid_region(), valid_region); + + // Validate valid region bias + if(add_bias) + { + const ValidRegion valid_region_bias = shape_to_valid_region(shape_bias); + validate(bias.info()->valid_region(), valid_region_bias); + } + + // Validate padding + const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); + validate(in.info()->padding(), padding); + validate(out.info()->padding(), padding); + + if(add_bias) + { + validate(bias.info()->padding(), padding); + } +} + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases)) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_cases)) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} + +TEST_SUITE(BoundedReLu) +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // BoundedReLu + +TEST_SUITE_END() // QuantizeDownInt32ToUint8Scale +TEST_SUITE_END() // OutputStage +TEST_SUITE_END() // GEMMLowp +TEST_SUITE_END() // CL +} // namespace validation +} // namespace test +} // namespace arm_compute \ No newline at end of file diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp index 6366223820..6d13fdc939 100644 --- a/tests/validation/NEON/GEMMLowp.cpp +++ b/tests/validation/NEON/GEMMLowp.cpp @@ -177,11 +177,11 @@ TEST_SUITE(OutputStage) TEST_SUITE(QuantizeDownInt32ToUint8Scale) -const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 2) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2, +const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true }); -const auto quantize_down_int32_to_uint8_scale_relu_cases = 
framework::dataset::make("result_offset", -2, 2) * framework::dataset::make("result_mult_int", 1, +const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true }); @@ -254,20 +254,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, } TEST_SUITE_END() // BoundedReLu -TEST_SUITE(AddBias) -FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) -{ - // Validate output - validate(Accessor(_target), _reference); -} - -FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_relu_cases)) -{ - // Validate output - validate(Accessor(_target), _reference); -} -TEST_SUITE_END() // AddBias - TEST_SUITE_END() // QuantizeDownInt32ToUint8Scale TEST_SUITE_END() // OutputStage -- cgit v1.2.1
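
The NEGEMMLowpMatrixMultiplyCore changes in this patch only allocate `_vector_sum_col` / `_vector_sum_row` when the corresponding offset is non-zero, and now guard `remove_dimension(1)` so it is skipped for shapes that have already collapsed to a single dimension (the degenerate vector-by-matrix style cases the new small-row dataset entries add). The sketch below is a self-contained reference, not library code: the function name, row-major layout and parameter list are illustrative. It shows the int32 result the matrix-multiply core plus the offset contribution are expected to produce, which is exactly why the row/column reductions are needed.

```cpp
// Reference sketch (illustrative, not the library kernels): int32 result of a
// low-precision GEMM where a_offset / b_offset are added to every element of
// A and B before the product, expanded into core product + offset contribution.
#include <cstdint>
#include <vector>

// A is M x K, B is K x N, both row-major uint8 quantized values.
std::vector<int32_t> gemmlowp_reference(const std::vector<uint8_t> &a,
                                        const std::vector<uint8_t> &b,
                                        int m, int n, int k,
                                        int32_t a_offset, int32_t b_offset)
{
    // Row sums of A (length M) and column sums of B (length N): the quantities
    // the reduction kernels produce as vector_sum_row / vector_sum_col.
    std::vector<int32_t> sum_row(m, 0), sum_col(n, 0);
    for(int i = 0; i < m; ++i)
        for(int j = 0; j < k; ++j)
            sum_row[i] += a[i * k + j];
    for(int i = 0; i < k; ++i)
        for(int j = 0; j < n; ++j)
            sum_col[j] += b[i * n + j];

    std::vector<int32_t> out(m * n, 0);
    for(int i = 0; i < m; ++i)
    {
        for(int j = 0; j < n; ++j)
        {
            int32_t acc = 0;
            for(int x = 0; x < k; ++x)
                acc += int32_t(a[i * k + x]) * int32_t(b[x * n + j]); // core uint8*uint8 -> int32 product
            // Offset contribution: a_offset * (column sums of B) + b_offset * (row sums of A)
            // + K * a_offset * b_offset. When an offset is zero, its reduction term vanishes,
            // which is why the runtime only creates the matching sum vector when offset != 0.
            acc += a_offset * sum_col[j] + b_offset * sum_row[i] + k * a_offset * b_offset;
            out[i * n + j] = acc;
        }
    }
    return out;
}
```

Expanding the product this way keeps the inner loop in pure uint8 arithmetic and folds all offset handling into two cheap reductions plus a constant, which is what makes the separate offset-contribution kernel worthwhile.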
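The new `CLGEMMLowpQuantizeDownInt32ToUint8Scale` function and the `QuantizeDownInt32ToUint8Scale` test suites exercise the quantize-down output stage. A minimal per-element sketch follows, assuming the documented pipeline (add `result_offset` and the optional bias, multiply by `result_mult_int`, arithmetic shift right by `result_shift`, optional bounded-ReLU clamp, then saturate to [0, 255]); the helper name is illustrative, the bias is passed as 0 when absent, and `min == max` is treated as "no bound", as in the test cases that pass min = max = 0.

```cpp
// Reference sketch (illustrative, not the OpenCL kernel): per-element behaviour
// expected from the int32 -> uint8 quantize-down output stage.
#include <algorithm>
#include <cstdint>

uint8_t quantize_down_int32_to_uint8_scale(int32_t acc, int32_t bias,
                                           int32_t result_offset, int32_t result_mult_int,
                                           int32_t result_shift, int32_t min, int32_t max)
{
    int32_t v = (acc + bias + result_offset) * result_mult_int; // offset, optional bias, integer scale
    v >>= result_shift;                                         // fixed-point down-shift
    if(min != max)                                              // assumed: min == max means no bounded ReLU
    {
        v = std::max<int32_t>(min, std::min<int32_t>(max, v));
    }
    v = std::max<int32_t>(0, std::min<int32_t>(255, v));        // saturate to the uint8 range
    return static_cast<uint8_t>(v);
}
```

This mirrors the shape of the BoundedReLu test cases above: the relu variants pass non-trivial `min`/`max` bounds, while the plain scale cases rely only on the final saturation to [0, 255].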