aboutsummaryrefslogtreecommitdiff
path: root/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
diff options
context:
space:
mode:
authorGian Marco <gianmarco.iodice@arm.com>2017-11-21 10:57:50 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:41:17 +0000
commit05288a2b871ef99f544771621c3bba409b2f70df (patch)
tree21e3d2a9927ef31f6d5bcdd5523c4c8e933047a6 /arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
parentc82799003fbfdc5bb9526ff944e41eaae23e3f03 (diff)
downloadComputeLibrary-05288a2b871ef99f544771621c3bba409b2f70df.tar.gz
COMPMID-697 - Rework GEMMLowp interface on OpenCL
Reworked the interface of GemmLowp in order to ease its integration into Android NN - Added support for different output stages - Added validation for both matrix multiplication and output stage - Added bounded relu support in the output stage - Added int32_t bias support - Added optimized path for the vector-by-matrix case This rework is required for: - Convolution quantized - Fully connected quantized Change-Id: I512283d406099cf8c614dd89d0a97ed411143afc Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110625 Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Diffstat (limited to 'arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h')
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h31
1 files changed, 12 insertions, 19 deletions
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
index 05956aeeba..b60b80618c 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
@@ -30,15 +30,15 @@ namespace arm_compute
{
class ICLTensor;
-/** OpenCL kernel to compute low precision matrix multiplication kernel
+/** OpenCL kernel to multiply matrices
*
+ * @note @ref CLGEMMLowpMatrixMultiplyKernel is a low precision matrix product kernel.
* This kernel performs the following computation:
- * -# Convert a values from uint8 to int32 and add a_offset to each of them.
- * -# Convert b values from uint8 to int32 and add b_offset to each of them.
- * -# Compute the int32 matrix product of the resulting a * b.
- * -# Add output_offset to each entry of the result.
- * -# Multiply each entry of the result and round to the nearest integer
- * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8.
+ *
+ * -# Convert a values from int8 to int32
+ * -# Convert b values from int8 to int32
+ * -# Compute the int32 matrix product of the resulting a * b and store the result as int32
+ *
*/
class CLGEMMLowpMatrixMultiplyKernel : public ICLKernel
{
@@ -55,19 +55,12 @@ public:
CLGEMMLowpMatrixMultiplyKernel &operator=(CLGEMMLowpMatrixMultiplyKernel &&) = default;
/** Initialise the kernel's input and output.
*
- * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel.
- * These two kernels change the layout of the original matrices to be more cache-friendly.
- *
- * @param[in] input0 Input tensor containing the interleaved Matrix A. Data types supported: U8
- * @param[in] input1 Input tensor containing the transposed Matrix B. Data types supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication, Data types supported: same as @p input0
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_offset Offset to be added to each element of the output matrix
- * @param[in] output_mult_int Offset to be added to each element of the output matrix
- * @param[in] shift Number of bits to shift right the result.
+ * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: QASYMM8
+ * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0
+ * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
+ * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
*/
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
+ void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed = true);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;