aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGian Marco <gianmarco.iodice@arm.com>2017-11-21 10:57:50 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:41:17 +0000
commit05288a2b871ef99f544771621c3bba409b2f70df (patch)
tree21e3d2a9927ef31f6d5bcdd5523c4c8e933047a6
parentc82799003fbfdc5bb9526ff944e41eaae23e3f03 (diff)
downloadComputeLibrary-05288a2b871ef99f544771621c3bba409b2f70df.tar.gz
COMPMID-697 - Rework GEMMLowp interface on OpenCL
Reworked the interface of GemmLowp in order to make easy the integration in Android NN - Added support for different output stage - Added validation for both matrix multiplication and output stage - Added bounded relu support in the output stage - Added in32_t bias support - Added optimized path for vector by matrix case This rework is required for: - Convolution quantized - Fully connected quantized Change-Id: I512283d406099cf8c614dd89d0a97ed411143afc Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110625 Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
-rw-r--r--arm_compute/core/CL/CLKernels.h3
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h31
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h82
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h84
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h99
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h4
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h2
-rw-r--r--arm_compute/runtime/CL/CLFunctions.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMLowp.h89
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h91
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h78
-rw-r--r--docs/00_introduction.dox2
-rw-r--r--src/core/CL/CLKernelLibrary.cpp11
-rw-r--r--src/core/CL/cl_kernels/gemm.cl104
-rw-r--r--src/core/CL/cl_kernels/gemmlowp.cl540
-rw-r--r--src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp99
-rw-r--r--src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp162
-rw-r--r--src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp128
-rw-r--r--src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp162
-rw-r--r--src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp2
-rw-r--r--src/runtime/CL/functions/CLGEMMLowp.cpp93
-rw-r--r--src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp178
-rw-r--r--src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp37
-rw-r--r--src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp10
-rw-r--r--tests/datasets/LargeGEMMLowpDataset.h2
-rw-r--r--tests/datasets/SmallGEMMLowpDataset.h4
-rw-r--r--tests/validation/CL/GEMMLowp.cpp172
-rw-r--r--tests/validation/NEON/GEMMLowp.cpp18
29 files changed, 1932 insertions, 360 deletions
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index 8da0cecad5..e80df6aada 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -58,6 +58,9 @@
#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
index 05956aeeba..b60b80618c 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
@@ -30,15 +30,15 @@ namespace arm_compute
{
class ICLTensor;
-/** OpenCL kernel to compute low precision matrix multiplication kernel
+/** OpenCL kernel to multiply matrices
*
+ * @note @ref CLGEMMLowpMatrixMultiplyKernel low precision matrix product kernel
* This kernel performs the following computation:
- * -# Convert a values from uint8 to int32 and add a_offset to each of them.
- * -# Convert b values from uint8 to int32 and add b_offset to each of them.
- * -# Compute the int32 matrix product of the resulting a * b.
- * -# Add output_offset to each entry of the result.
- * -# Multiply each entry of the result and round to the nearest integer
- * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8.
+ *
+ * -# Convert a values from int8 to int32
+ * -# Convert b values from int8 to int32
+ * -# Compute the int32 matrix product of the resulting a * b and store the result as int32
+ *
*/
class CLGEMMLowpMatrixMultiplyKernel : public ICLKernel
{
@@ -55,19 +55,12 @@ public:
CLGEMMLowpMatrixMultiplyKernel &operator=(CLGEMMLowpMatrixMultiplyKernel &&) = default;
/** Initialise the kernel's input and output.
*
- * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel.
- * These two kernels change the layout of the original matrices to be more cache-friendly.
- *
- * @param[in] input0 Input tensor containing the interleaved Matrix A. Data types supported: U8
- * @param[in] input1 Input tensor containing the transposed Matrix B. Data types supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication, Data types supported: same as @p input0
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_offset Offset to be added to each element of the output matrix
- * @param[in] output_mult_int Offset to be added to each element of the output matrix
- * @param[in] shift Number of bits to shift right the result.
+ * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: QASYMM8
+ * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0
+ * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
+ * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
*/
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
+ void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed = true);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
new file mode 100644
index 0000000000..5f2e025687
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (vector_sum_col[k] * a_offset) +
+ * (vector_sum_row[i] * b_offset) +
+ * (a_offset * b_offset * k)
+ *
+ */
+class CLGEMMLowpOffsetContributionKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLGEMMLowpOffsetContributionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ CLGEMMLowpOffsetContributionKernel(const CLGEMMLowpOffsetContributionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ CLGEMMLowpOffsetContributionKernel &operator=(const CLGEMMLowpOffsetContributionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGEMMLowpOffsetContributionKernel(CLGEMMLowpOffsetContributionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in, out] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
+ * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
+ * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] k Number of matrix A columns or Matrix B rows
+ * @param[in] a_offset Offset to be added to each element of the matrix A.
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ */
+ void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_vector_sum_col;
+ const ICLTensor *_vector_sum_row;
+ ICLTensor *_mm_result;
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
new file mode 100644
index 0000000000..75f3750714
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__
+#define __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ */
+class CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8
+ * @param[in] result_offset Offset to be added to each element of the input matrix
+ * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add
+ * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_bias;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__ */ \ No newline at end of file
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
new file mode 100644
index 0000000000..aa0583fe81
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Common interface for all OpenCL reduction kernels */
+class ICLGEMMLowpReductionKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ ICLGEMMLowpReductionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ ICLGEMMLowpReductionKernel(const ICLGEMMLowpReductionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ ICLGEMMLowpReductionKernel &operator=(const ICLGEMMLowpReductionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ ICLGEMMLowpReductionKernel(ICLGEMMLowpReductionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ ICLGEMMLowpReductionKernel &operator=(ICLGEMMLowpReductionKernel &&) = default;
+
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data type supported: S8
+ * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
+ */
+ virtual void configure(const ICLTensor *input, ICLTensor *output) = 0;
+
+protected:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+};
+
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class CLGEMMLowpMatrixAReductionKernel : public ICLGEMMLowpReductionKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] mtx_a Input tensor. Data type supported: QASYMM8
+ * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+ */
+ void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row) override;
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class CLGEMMLowpMatrixBReductionKernel : public ICLGEMMLowpReductionKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8
+ * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+ */
+ void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col) override;
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
index 27cb3f2c1c..989260de11 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
@@ -30,9 +30,9 @@ namespace arm_compute
{
class ITensor;
-/* NEON kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
+/** NEON kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
*
- * This kernel takes a final int32 accumulator value (the output of @NEGEMMLowpMatrixMultiplyKernel),
+ * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel),
* and adds to it the offset contribution of matrix A and matrix B in-place.
*
* The final result is:
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
index 7684350c0f..a522069330 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/* NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
*
* This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
* The following computations will be performed by the kernel:
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
index 9ca5cdf828..50d8b4070e 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
@@ -47,7 +47,7 @@ public:
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data type supported: S8
+ * @param[in] input Input tensor. Data type supported: QASYMM8
* @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
* @param[in] k Number of matrix A columns (or matrix B rows)
* @param[in] is_reshaped True if the input tensor has been reshaped
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 360372d192..9a20769ca1 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -59,7 +59,8 @@
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
#include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMLowp.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
#include "arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h"
#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowp.h b/arm_compute/runtime/CL/functions/CLGEMMLowp.h
deleted file mode 100644
index ffd997f6ec..0000000000
--- a/arm_compute/runtime/CL/functions/CLGEMMLowp.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_CLGEMMLOWP_H__
-#define __ARM_COMPUTE_CLGEMMLOWP_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
-#include "arm_compute/runtime/CL/CLMemoryGroup.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute GEMMLowp on OpenCL. This function calls the following OpenCL kernels:
-*
-* -# @ref CLGEMMInterleave4x4Kernel
-* -# @ref CLGEMMTranspose1xWKernel
-* -# @ref CLGEMMLowpMatrixMultiplyKernel
-*
-*/
-class CLGEMMLowp : public IFunction
-{
-public:
- /** Constructor */
- CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialise the kernel's inputs, output
- *
- * @note GEMM_LOWP: low precision matrix multiply kernel
- * This kernel performs the following computation:
- *
- * -# Convert a values from uint8 to int32 and add a_offset to each of them.
- * -# Convert b values from uint8 to int32 and add b_offset to each of them.
- * -# Compute the int32 matrix product of the resulting a * b.
- * -# Add output_offset to each entry of the result.
- * -# Multiply each entry of the result and round to the nearest integer
- * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8.
- *
- * @param[in] a First input tensor (Matrix A). Data types supported: U8.
- * @param[in] b Second input tensor (Matrix B). Data types supported: same as @p a.
- * @param[out] output Output tensor. Data types supported: same as @p a.
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_offset Offset to be added to each element of the output matrix
- * @param[in] output_mult_int Multiplied with each element of the output matrix
- * @param[in] shift Number of bits to shift right the result.
- */
- void configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- CLMemoryGroup _memory_group;
- CLGEMMInterleave4x4Kernel _interleave_kernel;
- CLGEMMTranspose1xWKernel _transpose_kernel;
- CLGEMMLowpMatrixMultiplyKernel _mm_kernel;
- CLTensor _tmp_a;
- CLTensor _tmp_b;
-};
-}
-#endif /*__ARM_COMPUTE_CLGEMMLOWP_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
new file mode 100644
index 0000000000..9944afeac7
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__
+#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__
+
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class IMemoryManager;
+class ICLTensor;
+
+/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLGEMMInterleave4x4Kernel (if the output tensor is a matrix)
+ * -# @ref CLGEMMTranspose1xWKernel (if the output tensor is a matrix)
+ * -# @ref CLGEMMLowpMatrixMultiplyKernel
+ * -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0)
+ * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0)
+ * -# @ref CLGEMMLowpOffsetContributionKernel
+ *
+*/
+class CLGEMMLowpMatrixMultiplyCore : public IFunction
+{
+public:
+ /** Constructor */
+ CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Initialise the kernel's inputs, output
+ *
+ * @note GEMM_LOWP: low precision GEMM kernel
+ * This kernel performs the following computations:
+ *
+ * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
+ * -# Convert b values from QASYMM8 to int32 add b_offset to each of them.
+ * -# Compute the matrix product of the resulting a * b in int32.
+ *
+ * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8.
+ * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
+ * @param[out] output Output tensor. Data type supported: Data type supported: S32
+ */
+ void configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLMemoryGroup _memory_group;
+ CLGEMMLowpMatrixMultiplyKernel _mm_kernel;
+ CLGEMMInterleave4x4Kernel _mtx_a_reshape_kernel;
+ CLGEMMTranspose1xWKernel _mtx_b_reshape_kernel;
+ CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel;
+ CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel;
+ CLGEMMLowpOffsetContributionKernel _offset_contribution_kernel;
+ CLTensor _vector_sum_col;
+ CLTensor _vector_sum_row;
+ CLTensor _tmp_a;
+ CLTensor _tmp_b;
+ int32_t _a_offset;
+ int32_t _b_offset;
+ bool _is_interleaved_transposed;
+};
+}
+#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
new file mode 100644
index 0000000000..5c176a960b
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__
+#define __ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+/** This file contains all available output stages for GEMMLowp on OpenCL.
+ *
+ * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyCore),
+ * and processes it to obtain the final ASYMM8 value.
+ *
+ * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md
+ */
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8Scale on OpenCL.
+ *
+ * CLGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift
+ * The final result is:
+ *
+ * ((input[i][k] + result_offset) * result_mult_int) >> result_shift
+ *
+ * In case the bias tensor is provided, the final result is:
+ *
+ * ((input[i][k] + result_offset) * result_mult_int + bias[k]) >> result_shift
+ *
+ * This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel
+ *
+ * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
+ * before the result is shifted right by result_shift
+*/
+class CLGEMMLowpQuantizeDownInt32ToUint8Scale : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output
+ *
+ * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8
+ * @param[in] result_offset Offset to be added to each element of the input matrix
+ * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add
+ * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__ */ \ No newline at end of file
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index b5a1d59f6a..cc12897278 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -253,7 +253,7 @@ v17.03.1 First Major public release of the sources
- New CPP target introduced for C++ kernels shared between NEON and CL functions.
- New padding calculation interface introduced and ported most kernels / functions to use it.
- New OpenCL kernels / functions:
- - @ref arm_compute::CLGEMMLowpMatrixMultiplyKernel / @ref arm_compute::CLGEMMLowp
+ - @ref arm_compute::CLGEMMLowpMatrixMultiplyKernel / arm_compute::CLGEMMLowp
- New NEON kernels / functions:
- @ref arm_compute::NENormalizationLayerKernel / @ref arm_compute::NENormalizationLayer
- @ref arm_compute::NETransposeKernel / @ref arm_compute::NETranspose
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 6cc5a9a6b5..948fe441cf 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -218,7 +218,6 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "gemm_ma_qs8", "gemm.cl" },
{ "gemm_ma_qs16", "gemm.cl" },
{ "gemm_mv", "gemv.cl" },
- { "gemm_mm_interleaved_transposed_u8", "gemm.cl" },
{ "gemm_mm_interleaved_transposed_f16", "gemm.cl" },
{ "gemm_mm_interleaved_transposed_f32_midgard", "gemm.cl" },
{ "gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl" },
@@ -233,6 +232,12 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "gemm_transpose1x16", "gemm.cl" },
{ "gemm_transpose1x8", "gemm.cl" },
{ "gemm_transpose1x4", "gemm.cl" },
+ { "gemmlowp_matrix_a_reduction", "gemmlowp.cl" },
+ { "gemmlowp_matrix_b_reduction", "gemmlowp.cl" },
+ { "gemmlowp_mm", "gemmlowp.cl" },
+ { "gemmlowp_mm_interleaved_transposed", "gemmlowp.cl" },
+ { "gemmlowp_offset_contribution", "gemmlowp.cl" },
+ { "gemmlowp_output_stage_quantize_down", "gemmlowp.cl" },
{ "harris_score_3x3", "harris_corners.cl" },
{ "harris_score_5x5", "harris_corners.cl" },
{ "harris_score_7x7", "harris_corners.cl" },
@@ -482,6 +487,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
#include "./cl_kernels/gemm.clembed"
},
{
+ "gemmlowp.cl",
+#include "./cl_kernels/gemmlowp.clembed"
+ },
+ {
"gemv.cl",
#include "./cl_kernels/gemv.clembed"
},
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 15111ed352..c763cb355b 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -251,110 +251,6 @@ __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
}
#if defined(COLS_B)
-/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
- *
- * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported formats: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported formats: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] a_offset Offset to be added to each element of the matrix A
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] c_offset Offset to be added to each element of the matrix C.
- * @param[in] c_mult_int Multiplied with each element of the matrix C.
- * @param[in] shift Number of bits to shift right the result.
- */
-__kernel void gemm_mm_interleaved_transposed_u8(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst),
- int a_offset,
- int b_offset,
- int c_offset,
- int c_mult_int,
- int shift)
-{
- // src_addr.s0 = address of matrix A
- // src_addr.s1 = address of matrix B
-
- // Compute address for matrix A and B
- int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
- (src1_stride_y));
-
- // Add offset_first_element_in_bytes
- src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Compute end row address for matrix B
- int end_row_mtx_b = src_addr.s1 + COLS_B;
-
- // Reset accumulators
- int16 c00 = 0.0f;
- int16 c10 = 0.0f;
- int16 c20 = 0.0f;
- int16 c30 = 0.0f;
-
- for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
- int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
-
- c00 += (int16)a0.s0 * b0;
- c10 += (int16)a0.s1 * b0;
- c20 += (int16)a0.s2 * b0;
- c30 += (int16)a0.s3 * b0;
-
- int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));
-
- c00 += (int16)a0.s4 * b1;
- c10 += (int16)a0.s5 * b1;
- c20 += (int16)a0.s6 * b1;
- c30 += (int16)a0.s7 * b1;
- }
-
- for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
- {
- // Load values from matrix A (interleaved) and matrix B (transposed)
- int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
- int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
-
- c00 += (int16)a0.s0 * b0;
- c10 += (int16)a0.s1 * b0;
- c20 += (int16)a0.s2 * b0;
- c30 += (int16)a0.s3 * b0;
- }
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Multiply by the weight of matrix product
- c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;
- c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;
- c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;
- c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;
-
- // Store 4x16 block
- vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));
- vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));
- vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
- vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
-}
-
/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
new file mode 100644
index 0000000000..7cd0c0b8db
--- /dev/null
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -0,0 +1,540 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(COLS_B)
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
+ *
+ * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemmlowp_mm_interleaved_transposed(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ // src_addr.s0 = address of matrix A
+ // src_addr.s1 = address of matrix B
+ // Compute address for matrix A and B
+ int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
+ (src1_stride_y));
+
+ // Add offset_first_element_in_bytes
+ src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Compute end row address for matrix B
+ int end_row_mtx_b = src_addr.s1 + COLS_B;
+
+ // Reset accumulators
+ int16 c00 = 0;
+ int16 c10 = 0;
+ int16 c20 = 0;
+ int16 c30 = 0;
+
+ for(; src_addr.s1 <= (end_row_mtx_b - 32); src_addr += (int2)(8, 32))
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ int8 a0 = convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
+ int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
+
+ c00 += (int16)a0.s0 * b0;
+ c10 += (int16)a0.s1 * b0;
+ c20 += (int16)a0.s2 * b0;
+ c30 += (int16)a0.s3 * b0;
+
+ int16 b1 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));
+
+ c00 += (int16)a0.s4 * b1;
+ c10 += (int16)a0.s5 * b1;
+ c20 += (int16)a0.s6 * b1;
+ c30 += (int16)a0.s7 * b1;
+ }
+
+ for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ int4 a0 = convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
+ int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
+
+ c00 += (int16)a0.s0 * b0;
+ c10 += (int16)a0.s1 * b0;
+ c20 += (int16)a0.s2 * b0;
+ c30 += (int16)a0.s3 * b0;
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Store 4x16 block
+ vstore16(c00, 0, (__global int *)(offset(&dst, 0, 0)));
+ vstore16(c10, 0, (__global int *)(offset(&dst, 0, 1)));
+ vstore16(c20, 0, (__global int *)(offset(&dst, 0, 2)));
+ vstore16(c30, 0, (__global int *)(offset(&dst, 0, 3)));
+}
+#endif // defined(COLS_B)
+
+#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
+#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemmlowp_mm(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx;
+
+ int end_row_vec_a = src_addr.s0 + COLS_A;
+
+ VECTOR_UINT acc0 = 0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VECTOR_UINT acc1 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VECTOR_UINT acc2 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VECTOR_UINT acc3 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
+ {
+ // Load values from matrix A
+ uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ // Load values from matrix B
+ VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
+ VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);
+
+ // Accumulate
+ acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;
+ acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;
+ acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;
+ acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;
+ acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
+ {
+ // Load values from matrix A
+ uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ // Load values from matrix B
+ VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
+
+ // Accumulate
+ acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Store the result
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 0)));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 1)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+ (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 3)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
+
+#if defined(COLS_A)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uint4 sum_row_u32 = (uint4)0;
+ uint sum_row = 0;
+
+ __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
+
+ int i = 0;
+
+ // This for loop performs 16 accumulations
+ for(; i <= ((int)COLS_A - 16); i += 16)
+ {
+ const uchar16 a0_u8 = vload16(0, matrix_a + i);
+
+ sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF);
+ }
+
+ // This for loop performs the leftover accumulations
+ for(; i < COLS_A; ++i)
+ {
+ sum_row += matrix_a[i];
+ }
+
+ sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3;
+
+ *((__global int *)dst.ptr) = (int)sum_row;
+}
+#endif // defined(COLS_A)
+
+#if defined(COLS_B) && defined(ROWS_B)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uint16 sum_col_u32 = (uint16)0;
+
+ __global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);
+
+ int i = 0;
+ // This for loop performs 4 accumulations
+ for(; i <= ((int)ROWS_B - 4); i += 4)
+ {
+ const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y);
+ const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y);
+ const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y);
+ const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y);
+
+ sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8);
+
+ matrix_b += 4 * src_stride_y;
+ }
+
+ // This for loop perfoms the leftover accumulations
+ for(; i < (int)ROWS_B; ++i)
+ {
+ const uchar16 b0_u8 = vload16(0, matrix_b);
+
+ sum_col_u32 += convert_uint16(b0_u8);
+
+ matrix_b += src_stride_y;
+ }
+
+ vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr);
+}
+#endif // defined(COLS_B) && defined(ROWS_B)
+
+#if defined(K_OFFSET)
+/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
+ *
+ * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (sum_col[k] * A_OFFSET) +
+ * (sum_row[i] * B_OFFSET) +
+ * (K_OFFSET)
+ *
+ * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_col_result_ptr Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_result_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_result_step_x sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_result_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_result_step_y sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_row_result_ptr Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_result_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_result_step_x sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_result_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_result_step_y sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+ )
+{
+ Tensor3D mm_result = CONVERT_TO_TENSOR3D_STRUCT(mm_result);
+
+ int16 a_offset_s32 = (int16)0;
+ int16 b_offset_s32 = (int16)0;
+
+#if defined(A_OFFSET)
+ Image sum_col = CONVERT_TO_IMAGE_STRUCT(sum_col);
+
+ // Compute the offset contribution due to A_OFFSET
+ a_offset_s32 = vload16(0, (__global int *)sum_col.ptr + get_global_id(2) * sum_col_stride_y);
+ a_offset_s32 *= (int16)A_OFFSET;
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+ Image sum_row = CONVERT_TO_IMAGE_STRUCT(sum_row);
+
+ // Compute the offset contribution due to B_OFFSET
+ b_offset_s32 = (int16) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1));
+ b_offset_s32 *= (int16)B_OFFSET;
+#endif // defined(B_OFFSET)
+
+ const int16 offset_term_s32 = (int16)K_OFFSET + a_offset_s32 + b_offset_s32;
+
+ int16 in_s32 = vload16(0, (__global int *)mm_result.ptr);
+
+ // Add the offset terms to GEMM's result
+ in_s32 += offset_term_s32;
+
+ // Store the result with the offset contribution
+ vstore16(in_s32, 0, (__global int *)mm_result.ptr);
+}
+#endif // defined(K_OFFSET)
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ * These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+#if defined(ADD_BIAS)
+ Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+#endif // defined(ADD_BIAS)
+
+ int16 input_values = vload16(0, (__global int *)src.ptr);
+
+ // Add the offset terms to GEMM's result
+ input_values += (int16)RESULT_OFFSET;
+
+ // Multiply by result_mult_int
+ input_values *= (int16)RESULT_MULT_INT;
+
+#if defined(ADD_BIAS)
+ // Add bias
+ const int16 biases_values = vload16(0, (__global int *)biases.ptr);
+ input_values += (int16)biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Shift final result
+ input_values >>= RESULT_SHIFT;
+
+ // Saturate negative values
+ input_values = max(input_values, (int16)0);
+
+ uchar16 res = convert_uchar16_sat(input_values);
+
+#if defined(MIN_BOUND)
+ res = max(res, (uchar16)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (uchar16)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore16(res, 0, dst.ptr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) \ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index ef572cfc7e..b3227c0db9 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -51,45 +51,88 @@ CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel()
{
}
-void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output,
- int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
+void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+ if(!is_interleaved_transposed)
+ {
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+ }
+
+ TensorShape in1_shape = input1->info()->tensor_shape();
+ in1_shape.collapse(2);
_input0 = input0;
_input1 = input1;
_output = output;
- // Create kernel and set static arguments
- std::set<std::string> build_opts = { ("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))) };
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_u8", build_opts));
- unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
- _kernel.setArg<int32_t>(idx++, a_offset);
- _kernel.setArg<int32_t>(idx++, b_offset);
- _kernel.setArg<int32_t>(idx++, output_offset);
- _kernel.setArg<int32_t>(idx++, output_mult_int);
- _kernel.setArg<int32_t>(idx++, shift);
+ CLBuildOptions build_opts;
- // Configure window
- constexpr unsigned int num_elems_processed_per_iteration_x = 16;
- constexpr unsigned int num_elems_processed_per_iteration_y = 4;
- constexpr unsigned int num_elems_read_per_iteration_input0 = 4;
- constexpr unsigned int num_elems_read_per_iteration_input1 = 16;
+ if(is_interleaved_transposed)
+ {
+ // Create kernel and set static arguments
+ build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_mm_interleaved_transposed", build_opts.options()));
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ // Configure window
+ constexpr unsigned int num_elems_processed_per_iteration_x = 16;
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+ constexpr unsigned int num_elems_read_per_iteration_input0 = 4;
+ constexpr unsigned int num_elems_read_per_iteration_input1 = 16;
- AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1);
- AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- update_window_and_padding(win, input0_access, input1_access, output_access);
+ AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1);
+ AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+ }
+ else
+ {
+ // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x
+ constexpr unsigned int num_elems_processed_per_iteration_x = 16;
+ const unsigned int num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+
+ build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
+ build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
+ build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y));
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_mm", build_opts.options()));
+
+ // Configure window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
+ AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+ }
- ICLKernel::configure(win);
+ // Set config_id for enabling LWS tuning
+ _config_id = "gemmlowp_";
+ _config_id += (is_interleaved_transposed ? "reshaped_" : "");
+ _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
}
void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -117,7 +160,7 @@ void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue
add_2D_tensor_argument(idx, _input0, slice);
add_2D_tensor_argument(idx, _input1, slice_b);
add_2D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice);
+ enqueue(queue, *this, slice, _lws_hint);
}
while(window.slide_window_slice_2D(slice));
}
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
new file mode 100644
index 0000000000..96919fe3cb
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel()
+ : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr)
+{
+}
+
+void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+
+ // Set the arguments to pass at compile time
+ CLBuildOptions build_opts;
+
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if(a_offset != 0)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0));
+
+ TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape();
+ vector_sum_col_shape.collapse(1);
+
+ build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+ }
+
+ // If b_offset == 0, vector_sum_row can be a nullptr
+ if(b_offset != 0)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1));
+
+ TensorShape output_shape = mm_result->info()->tensor_shape();
+ TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape();
+ vector_sum_row_shape.collapse(1);
+ output_shape.collapse(2);
+
+ ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor");
+
+ if(a_offset != 0)
+ {
+ TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape();
+ vector_sum_col_shape.collapse(1);
+
+ ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1
+ && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+ }
+
+ build_opts.add_option("-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+ }
+
+ build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options()));
+
+ _vector_sum_col = vector_sum_col;
+ _vector_sum_row = vector_sum_row;
+ _mm_result = mm_result;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, mm_result_access);
+
+ if(a_offset != 0)
+ {
+ AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, vector_sum_col_access);
+ }
+
+ if(b_offset != 0)
+ {
+ AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0);
+ update_window_and_padding(win, vector_sum_row_access);
+ }
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ // Set window for vector_sum_col
+ Window win_vector_sum_col = slice;
+ win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ // Set window for vector_sum_row
+ Window win_vector_sum_row = slice;
+ win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _mm_result, slice);
+ if(_vector_sum_col != nullptr)
+ {
+ add_2D_tensor_argument(idx, _vector_sum_col, win_vector_sum_col);
+ }
+ if(_vector_sum_row != nullptr)
+ {
+ add_2D_tensor_argument(idx, _vector_sum_row, win_vector_sum_row);
+ }
+ enqueue(queue, *this, slice);
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
new file mode 100644
index 0000000000..fa6a48e77c
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel()
+ : _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min,
+ int max)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON(max > 255);
+ ARM_COMPUTE_ERROR_ON(min < 0 || min > max);
+
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != bias->info()->dimension(0));
+ }
+
+ _input = input;
+ _bias = bias;
+ _output = output;
+
+ // Set the arguments to pass at compile time
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(result_offset));
+ build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(result_mult_int));
+ build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(result_shift));
+ build_opts.add_option_if((min != 0) && (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+ build_opts.add_option_if((max != 255) && (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+ build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_output_stage_quantize_down", build_opts.options()));
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_result_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ input_access,
+ output_result_access);
+
+ if(bias != nullptr)
+ {
+ AccessWindowStatic bias_access(bias->info(), 0, 0, ceil_to_multiple(bias->info()->dimension(0), num_elems_processed_per_iteration), bias->info()->tensor_shape()[1]);
+
+ update_window_and_padding(win,
+ bias_access);
+ }
+
+ output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ unsigned int idx1 = num_arguments_per_3D_tensor();
+ if(_bias != nullptr)
+ {
+ Window biases_slice(slice);
+ biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+ biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ add_1D_tensor_argument(idx1, _bias, biases_slice);
+ }
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx1, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(collapsed.slide_window_slice_3D(slice));
+} \ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
new file mode 100644
index 0000000000..6f410d3b14
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel()
+ : _input(), _output()
+{
+}
+
+void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+ _input = mtx_a;
+ _output = vector_sum_row;
+
+ // Set the arguments to pass at compile time
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->info()->dimension(0)));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_a_reduction", build_opts.options()));
+
+ const unsigned int num_elems_processed_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1));
+ AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ input_access,
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
+ Window slice_in = collapsed.first_slice_window_2D();
+ Window slice_out = collapsed.first_slice_window_2D();
+
+ // Setup input slice. Its dimensions are increased in the cl kernel.
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_2D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ }
+ while(collapsed.slide_window_slice_2D(slice_out));
+}
+
+void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+
+ _input = mtx_b;
+ _output = vector_sum_col;
+
+ // Set the arguments to pass at compile time
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(0)));
+ build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(1)));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_b_reduction", build_opts.options()));
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*vector_sum_col->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1));
+ AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ input_access,
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY);
+
+ Window slice_out = collapsed.first_slice_window_2D();
+ Window slice_in = slice_out;
+
+ slice_in.set(Window::DimY, Window::Dimension(0, 1, 1));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_2D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ }
+ while(collapsed.slide_window_slice_2D(slice_out));
+}
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index a8395a15cb..81094f8743 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -209,7 +209,7 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf
uint32x4_t sum_row_u32 = vdupq_n_u32(0);
uint32_t sum_row = 0;
- const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + +id.y() * _input->info()->strides_in_bytes()[2]);
+ const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
#if __arm__
asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp
deleted file mode 100644
index db6d11c2c3..0000000000
--- a/src/runtime/CL/functions/CLGEMMLowp.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLGEMMLowp.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute;
-
-CLGEMMLowp::CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
-{
-}
-
-void CLGEMMLowp::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
- ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
-
- // Create shape for interleaved temporary tensor
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, ceil(a->info()->dimension(1) / 4));
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
- _tmp_a.allocator()->init(info_a);
-
- // Create shape for tranposed temporary tensor
- TensorShape shape_tmp_b = b->info()->tensor_shape();
- shape_tmp_b.set(0, b->info()->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(static_cast<float>(b->info()->dimension(0)) / 16));
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
- _tmp_b.allocator()->init(info_b);
-
- // Manage intermediate buffers
- _memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
-
- // Configure kernels
- _interleave_kernel.configure(a, &_tmp_a);
- _transpose_kernel.configure(b, &_tmp_b);
- _mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift);
-
- // Allocate intermediate buffers
- _tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
-}
-
-void CLGEMMLowp::run()
-{
- _memory_group.acquire();
-
- /* Run interleave kernel */
- CLScheduler::get().enqueue(_interleave_kernel, false);
-
- /* Run transpose kernel */
- CLScheduler::get().enqueue(_transpose_kernel, false);
-
- /* Run matrix multiply kernel */
- CLScheduler::get().enqueue(_mm_kernel, false);
-
- _memory_group.release();
-}
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
new file mode 100644
index 0000000000..5d2d13e243
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
+ _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true)
+{
+}
+
+void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+
+ _a_offset = a->info()->quantization_info().offset;
+ _b_offset = b->info()->quantization_info().offset;
+
+ // If the input tensor has less than 16 rows, we run a special version of GEMMLowp without reshaping the input tensors
+ _is_interleaved_transposed = a->info()->dimension(1) > 16;
+
+ const ICLTensor *matrix_a = a;
+ const ICLTensor *matrix_b = b;
+
+ if(_is_interleaved_transposed)
+ {
+ matrix_a = &_tmp_a;
+ matrix_b = &_tmp_b;
+
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
+ shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
+
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ _tmp_a.allocator()->init(info_a);
+ _tmp_b.allocator()->init(info_b);
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
+ // Configure interleave kernel
+ _mtx_a_reshape_kernel.configure(a, &_tmp_a);
+
+ // Configure transpose kernel
+ _mtx_b_reshape_kernel.configure(b, &_tmp_b);
+ }
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed);
+
+ // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0)
+ {
+ TensorShape shape_vector_sum_col = b->info()->tensor_shape();
+ if(b->info()->num_dimensions() > 1)
+ {
+ shape_vector_sum_col.remove_dimension(1);
+ }
+ TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);
+ _vector_sum_col.allocator()->init(info_vector_sum_col);
+ _memory_group.manage(&_vector_sum_col);
+
+ // Configure Matrix B reduction kernel
+ _mtx_b_reduction_kernel.configure(b, &_vector_sum_col);
+ }
+
+ // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+ if(_b_offset != 0)
+ {
+ TensorShape shape_vector_sum_row = a->info()->tensor_shape();
+ shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));
+ if(a->info()->num_dimensions() > 1)
+ {
+ shape_vector_sum_row.remove_dimension(1);
+ }
+ TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);
+ _vector_sum_row.allocator()->init(info_vector_sum_row);
+ _memory_group.manage(&_vector_sum_row);
+
+ // Configure matrix A reduction kernel
+ _mtx_a_reduction_kernel.configure(a, &_vector_sum_row);
+ }
+
+ // Configure offset contribution kernel
+ _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+
+ // Allocate tensors
+ if(_is_interleaved_transposed)
+ {
+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+ }
+
+ if(_a_offset != 0)
+ {
+ _vector_sum_col.allocator()->allocate();
+ }
+
+ if(_b_offset != 0)
+ {
+ _vector_sum_row.allocator()->allocate();
+ }
+}
+
+void CLGEMMLowpMatrixMultiplyCore::run()
+{
+ _memory_group.acquire();
+
+ if(_is_interleaved_transposed)
+ {
+ // Run reshape matrix A
+ CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
+
+ // Run reshape matrix B
+ CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
+ }
+
+ // Run matrix multiply
+ CLScheduler::get().enqueue(_mm_kernel, false);
+
+ // Run matrix A reduction kernel only if _b_offset is not equal to 0
+ if(_b_offset != 0)
+ {
+ CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0)
+ {
+ CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
+ }
+
+ // Run offset contribution kernel
+ CLScheduler::get().enqueue(_offset_contribution_kernel, true);
+
+ _memory_group.release();
+}
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
new file mode 100644
index 0000000000..b1d620d8a2
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel>();
+ k->configure(input, bias, output, result_offset, result_mult_int, result_shift, min, max);
+ _kernel = std::move(k);
+} \ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 92c911c370..da5ac22fdc 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -133,7 +133,10 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
if(_a_offset != 0)
{
TensorShape shape_vector_sum_col = b->info()->tensor_shape();
- shape_vector_sum_col.remove_dimension(1);
+ if(b->info()->num_dimensions() > 1)
+ {
+ shape_vector_sum_col.remove_dimension(1);
+ }
TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);
_vector_sum_col.allocator()->init(info_vector_sum_col);
_memory_group.manage(&_vector_sum_col);
@@ -147,7 +150,10 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
{
TensorShape shape_vector_sum_row = a->info()->tensor_shape();
shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));
- shape_vector_sum_row.remove_dimension(1);
+ if(a->info()->num_dimensions() > 1)
+ {
+ shape_vector_sum_row.remove_dimension(1);
+ }
TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);
_vector_sum_row.allocator()->init(info_vector_sum_row);
_memory_group.manage(&_vector_sum_row);
diff --git a/tests/datasets/LargeGEMMLowpDataset.h b/tests/datasets/LargeGEMMLowpDataset.h
index cc1feb49a2..87f879e70a 100644
--- a/tests/datasets/LargeGEMMLowpDataset.h
+++ b/tests/datasets/LargeGEMMLowpDataset.h
@@ -42,7 +42,9 @@ class LargeGEMMLowpDataset final : public GEMMLowpDataset
public:
LargeGEMMLowpDataset()
{
+ add_config(TensorShape(923U, 2U), TensorShape(871U, 923U), TensorShape(871U, 2U), 0, 0);
add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0);
+ add_config(TensorShape(873U, 7U), TensorShape(784U, 873U), TensorShape(784U, 7U), -1, 3);
add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4);
add_config(TensorShape(697U, 872U), TensorShape(563U, 697U), TensorShape(563U, 872U), -2, 0);
add_config(TensorShape(1021U, 973U), TensorShape(783U, 1021U), TensorShape(783U, 973U), 5, 13);
diff --git a/tests/datasets/SmallGEMMLowpDataset.h b/tests/datasets/SmallGEMMLowpDataset.h
index 881546e70f..1d4ab53be5 100644
--- a/tests/datasets/SmallGEMMLowpDataset.h
+++ b/tests/datasets/SmallGEMMLowpDataset.h
@@ -42,11 +42,13 @@ class SmallGEMMLowpDataset final : public GEMMLowpDataset
public:
SmallGEMMLowpDataset()
{
+ add_config(TensorShape(21U, 2U), TensorShape(43U, 21U), TensorShape(43U, 2U), 0, 0);
add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), 0, 0);
+ add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U), -2, 13);
add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U), 0, 4);
add_config(TensorShape(52U, 26U), TensorShape(33U, 52U), TensorShape(33U, 26U), -2, 0);
add_config(TensorShape(31U, 27U), TensorShape(23U, 31U), TensorShape(23U, 27U), 5, 13);
- add_config(TensorShape(38U, 12U), TensorShape(21U, 38U), TensorShape(21U, 12U), -3, -2);
+ add_config(TensorShape(38U, 43U), TensorShape(21U, 38U), TensorShape(21U, 43U), -3, -2);
add_config(TensorShape(32U, 72U), TensorShape(17U, 32U), TensorShape(17U, 72U), -9, 1);
}
};
diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp
new file mode 100644
index 0000000000..1968efcedc
--- /dev/null
+++ b/tests/validation/CL/GEMMLowp.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/LargeGEMMLowpDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/datasets/SmallGEMMLowpDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/GEMMLowpFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(GEMMLowp)
+
+TEST_SUITE(MatrixMultiplyCore)
+using CLGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore>;
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()),
+ shape_a, shape_b, shape_c, a_offset, b_offset)
+{
+ // Create tensors
+ CLTensor a = create_tensor<CLTensor>(shape_a, DataType::QASYMM8);
+ CLTensor b = create_tensor<CLTensor>(shape_b, DataType::QASYMM8);
+ CLTensor c = create_tensor<CLTensor>(shape_c, DataType::S32);
+
+ a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset));
+ b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset));
+
+ ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Create and configure function
+ CLGEMMLowpMatrixMultiplyCore gemmlowp_mm;
+ gemmlowp_mm.configure(&a, &b, &c);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+
+TEST_SUITE_END() // MatrixMultiplyCore
+
+TEST_SUITE(OutputStage)
+TEST_SUITE(QuantizeDownInt32ToUint8Scale)
+
+const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2,
+ 3)
+ * framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true });
+
+const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1,
+ 2)
+ * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 173) * framework::dataset::make("addBias", { false, true });
+
+using CLGEMMLowpQuantizeDownInt32ToUint8ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture<CLTensor, CLAccessor, CLGEMMLowpQuantizeDownInt32ToUint8Scale>;
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallShapes(), datasets::LargeShapes()), quantize_down_int32_to_uint8_scale_cases),
+ shape, result_offset, result_mult_int, result_shift, min, max, add_bias)
+{
+ TensorShape shape_bias(shape[0]);
+
+ // Create tensors
+ CLTensor in = create_tensor<CLTensor>(shape, DataType::S32);
+ CLTensor bias = create_tensor<CLTensor>(shape_bias, DataType::S32);
+ CLTensor out = create_tensor<CLTensor>(shape, DataType::QASYMM8);
+
+ ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Create and configure function
+ CLGEMMLowpQuantizeDownInt32ToUint8Scale output_stage;
+ output_stage.configure(&in, add_bias ? &bias : nullptr, &out, result_offset, result_mult_int, result_shift, min, max);
+
+ // Validate valid region input and output
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(in.info()->valid_region(), valid_region);
+ validate(out.info()->valid_region(), valid_region);
+
+ // Validate valid region bias
+ if(add_bias)
+ {
+ const ValidRegion valid_region_bias = shape_to_valid_region(shape_bias);
+ validate(bias.info()->valid_region(), valid_region_bias);
+ }
+
+ // Validate padding
+ const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
+ validate(in.info()->padding(), padding);
+ validate(out.info()->padding(), padding);
+
+ if(add_bias)
+ {
+ validate(bias.info()->padding(), padding);
+ }
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_cases))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+
+TEST_SUITE(BoundedReLu)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // BoundedReLu
+
+TEST_SUITE_END() // QuantizeDownInt32ToUint8Scale
+TEST_SUITE_END() // OutputStage
+TEST_SUITE_END() // GEMMLowp
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute \ No newline at end of file
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index 6366223820..6d13fdc939 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -177,11 +177,11 @@ TEST_SUITE(OutputStage)
TEST_SUITE(QuantizeDownInt32ToUint8Scale)
-const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 2) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2,
+const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1, 2) * framework::dataset::make("result_shift", 2,
3)
* framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true });
-const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 2) * framework::dataset::make("result_mult_int", 1,
+const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 1) * framework::dataset::make("result_mult_int", 1,
2)
* framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0, 2) * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true });
@@ -254,20 +254,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture,
}
TEST_SUITE_END() // BoundedReLu
-TEST_SUITE(AddBias)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
-{
- // Validate output
- validate(Accessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
-{
- // Validate output
- validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // AddBias
-
TEST_SUITE_END() // QuantizeDownInt32ToUint8Scale
TEST_SUITE_END() // OutputStage